From bd04f66e0bc38503d2fa3e005098bbb9904104a5 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 3 Apr 2025 18:23:52 +0200 Subject: [PATCH 01/55] Add prototype for sptrsv --- include/graphblas/base/blas2.hpp | 96 ++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp index 3b5e5df4e..c273ca158 100644 --- a/include/graphblas/base/blas2.hpp +++ b/include/graphblas/base/blas2.hpp @@ -337,6 +337,102 @@ namespace grb { return UNSUPPORTED; } + /** + * Computes \f$ x + y \f$, where \f$ y \f$ is solved from \f$ Ty=b \f$. + * + * Here, \f$ T, b \f$ are given while \f$ T \f$ additionally must be either + * lower- or upper-triangular. The output contribution by \f$ y \f$ furthermore + * may be masked. + * + * @param[in,out] x On input: a vector with sufficient capacity to store the + * union of any pre-existing nonzeroes in \f$ x \f$ plus the + * (masked) result of \f$ y \f$ when #grb::EXECUTE is + * given. + * On output: the result of the computation when + * #grb::EXECUTE is given, or a vector with the exact same + * contents as on input but with the capacity resized to + * ensure it could store the result of the requested + * computation if #grb::RESIZE is given. + * @param[in] x_mask The mask that acts on \f$ y \f$. This vector has either + * size zero or size equal to that of \a x. + * + * \note If \a x_mask has size zero, it is interpreted as though the operation + * unmasked. + * + * @param[in] T The upper- or lower-triangular input matrix. This must be + * a square matrix with size equal to that of \a x. + * @param[in] b The right-hand side input vector. Its size must be equal + * to that of \a x. + * @param[in] forward Whether to perform forward substitution (i.e., \f$ T \f$ + * is lower-triangular), or to perform backward substitution + * instead (i.e., \f$ T \f$ is upper-triangular). + * + * \note With the structural information ALP/Dense passes as template + * information, \a forward would not be required. + * + * @param[in] semiring The semiring under which to perform the SpTrsv. + * @param[in] subtraction The inverse of the additive operator of \a semiring. + * @param[in] division The inverse of the multiplicative operator of + * \a semiring. + * + * \note That is, this operation in fact requires a field structure and not a + * semiring. A future extension to ALP may provide such a structure + * explicitly. + * + * @param[in] phase The requested phase of the computation. Only + * #grb::EXECUTE and #grb::RESIZE are supported. + * + * \todo Expand documentation. + */ + template< + Descriptor descr = descriptors::no_operation, + class Semiring, + class Subtraction, + class Division, + typename IOType, typename InputType1, typename InputType2, + typename InputType3, typename InputType4, + typename Coords, typename RIT, typename CIT, typename NIT, + Backend backend + > + RC sptrsv( + Vector< IOType, backend, Coords > &x, + const Vector< InputType3, backend, Coords > &x_mask, + const Matrix< InputType2, backend, RIT, CIT, NIT > &T, + const Vector< InputType1, backend, Coords > &b, + const bool forward, + const Semiring &semiring = Semiring(), + const Subtraction &subtraction = Subtraction(), + const Division &division = Division(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Semiring >::value && + grb::is_operator< Subtraction >::value && + grb::is_operator< Division >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + !grb::is_object< InputType4 >::value, + void >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cerr << "Selected backend does not implement masked grb::sptrsv\n"; +#endif +#ifndef NDEBUG + const bool selected_backend_does_not_support_masked_sptrsv = false; + assert( selected_backend_does_not_support_masked_sptrsv ); +#endif + (void) x; + (void) x_mask; + (void) T; + (void) b; + (void) semiring; + (void) subtraction; + (void) division; + (void) phase; + return UNSUPPORTED; + } + /** * Executes an arbitrary element-wise user-defined function \a f on all * nonzero elements of a given matrix \a A. From 7e139b0048abf0c03308c66ec33764dd051e9815 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 12:47:23 +0200 Subject: [PATCH 02/55] simplify spec --- include/graphblas/base/blas2.hpp | 55 +++++++++++++++----------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp index c273ca158..902c83a58 100644 --- a/include/graphblas/base/blas2.hpp +++ b/include/graphblas/base/blas2.hpp @@ -338,31 +338,33 @@ namespace grb { } /** - * Computes \f$ x + y \f$, where \f$ y \f$ is solved from \f$ Ty=b \f$. + * Computes \f$ x \f$ from \f$ Tx=b \f$. * * Here, \f$ T, b \f$ are given while \f$ T \f$ additionally must be either - * lower- or upper-triangular. The output contribution by \f$ y \f$ furthermore - * may be masked. - * - * @param[in,out] x On input: a vector with sufficient capacity to store the - * union of any pre-existing nonzeroes in \f$ x \f$ plus the - * (masked) result of \f$ y \f$ when #grb::EXECUTE is - * given. - * On output: the result of the computation when - * #grb::EXECUTE is given, or a vector with the exact same - * contents as on input but with the capacity resized to - * ensure it could store the result of the requested - * computation if #grb::RESIZE is given. - * @param[in] x_mask The mask that acts on \f$ y \f$. This vector has either + * lower- or upper-triangular. The output \f$ x \f$ may furthermore be masked. + * + * @param[in,out] xb On input: the vector b. On output: the vector x. + * When #grb::EXECUTE is given this vector must have + * sufficient capacity to store both any pre-existing + * nonzeroes in \f$ b \f$ plus the nonzeroes in the (masked) + * result of \f$ x \f$. + * When #grb::RESIZE is given, the vector contents are + * unchanged, however its capacity may be enlarged in order + * to ensure the above-described condition. + * + * \note On input, \a xb may contain both implicit and explicit zeroes. + * + * \note On output, \a xb may contain both implicit and explicit zeroes, even + * if on input it did not. + * + * @param[in] mask The mask that acts on \f$ x \f$. This vector has either * size zero or size equal to that of \a x. * - * \note If \a x_mask has size zero, it is interpreted as though the operation + * \note If \a mask has size zero, it is interpreted as though the operation * unmasked. * * @param[in] T The upper- or lower-triangular input matrix. This must be - * a square matrix with size equal to that of \a x. - * @param[in] b The right-hand side input vector. Its size must be equal - * to that of \a x. + * a square matrix with size equal to that of \a xb. * @param[in] forward Whether to perform forward substitution (i.e., \f$ T \f$ * is lower-triangular), or to perform backward substitution * instead (i.e., \f$ T \f$ is upper-triangular). @@ -390,15 +392,13 @@ namespace grb { class Subtraction, class Division, typename IOType, typename InputType1, typename InputType2, - typename InputType3, typename InputType4, typename Coords, typename RIT, typename CIT, typename NIT, Backend backend > RC sptrsv( - Vector< IOType, backend, Coords > &x, - const Vector< InputType3, backend, Coords > &x_mask, - const Matrix< InputType2, backend, RIT, CIT, NIT > &T, - const Vector< InputType1, backend, Coords > &b, + Vector< IOType, backend, Coords > &xb, + const Vector< InputType2, backend, Coords > &mask, + const Matrix< InputType1, backend, RIT, CIT, NIT > &T, const bool forward, const Semiring &semiring = Semiring(), const Subtraction &subtraction = Subtraction(), @@ -410,9 +410,7 @@ namespace grb { grb::is_operator< Division >::value && !grb::is_object< IOType >::value && !grb::is_object< InputType1 >::value && - !grb::is_object< InputType2 >::value && - !grb::is_object< InputType3 >::value && - !grb::is_object< InputType4 >::value, + !grb::is_object< InputType2 >::value, void >::type * const = nullptr ) { #ifdef _DEBUG @@ -422,10 +420,9 @@ namespace grb { const bool selected_backend_does_not_support_masked_sptrsv = false; assert( selected_backend_does_not_support_masked_sptrsv ); #endif - (void) x; - (void) x_mask; + (void) xb; + (void) mask; (void) T; - (void) b; (void) semiring; (void) subtraction; (void) division; From 01f7ed05595e95b55a818838c1c8cf40571b5829 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 15:37:58 +0200 Subject: [PATCH 03/55] First non-vectorised reference sptrsv implementation, not yet tested --- include/graphblas/reference/blas2.hpp | 254 ++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 7a4d3e135..2cfeac82b 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2216,6 +2216,260 @@ namespace grb { namespace internal { +#ifndef _H_GRB_REFERENCE_OMP_BLAS2 + /** Backend-independent sptrsv kernel code. */ + template< + typename IOType, typename InputType1, + typename IND, typename NIT, + class Semiring, class Subtraction + > + inline void sptrsv_kernel( + const Compressed_Storage< InputType1, IND, NIT > &crs, + const size_t i, + IOType * const x, + IOType &divBy, + const Semiring &semiring, + const Subtraction &subtraction + ) { + constexpr auto one = Semiring::template getOne< typename Semiring::D2 >(); + for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { + const typename Semiring::D2 val = crs.template getValue( k, one ); + const auto &ind = crs.row_index[ k ]; + if( ind == i ) { + divBy = val; + continue; + } + const typename Semiring::D3 tmp; + (void) grb::apply( tmp, val, x[ ind ], + semiring.getMultiplicativeOperator() ); + (void) grb::foldl( x[ i ], tmp, subtraction ); + } + } +#endif + + /** \internal Specialised dense unmasked sptrsv implementation */ + template< + Descriptor descr, + class Semiring, class Subtraction, class Division, + typename IOType, typename InputType1, + typename Coords, typename RIT, typename CIT, typename NIT + > + RC dense_unmasked_sptrsv( + Vector< IOType, reference, Coords > &xb, + const Matrix< InputType1, reference, RIT, CIT, NIT > &T, + const size_t &n, + const bool forward, + const Semiring &semiring, + const Subtraction &subtraction, + const Division &division, + const Phase &phase + ) { + // dynamic sanity checks + assert( grb::size( xb ) == n ); + assert( grb::nnz( xb ) == n ); + assert( grb::nrows( T ) == n ); + assert( grb::ncols( T ) == n ); + + // in dense unmasked, resize is a no-op + if( phase == grb::RESIZE ) { return grb::SUCCESS; } + + // only execute and resize are supported + assert( phase == grb::EXECUTE ); + + // get required data handles + IOType * const v_raw = internal::getRaw( xb ); + size_t new_nz = 0; + + // switch forward or backward solve + if( forward ) { + const auto &crs = internal::getCRS( T ); + for( size_t i = 0; i < n; ++i ) { + IOType divBy = semiring.template getZero< IOType >(); + assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); + sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); + (void) grb::foldl( v_raw[ i ], divBy, division ); + } + } else { + const auto &ccs = internal::getCCS( T ); + for( size_t i = n - 1; i < n; --i ) { + IOType divBy = semiring.template getZero< IOType >(); + assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); + sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction ); + (void) grb::foldl( v_raw[ i ], divBy, division ); + } + } + + // done + return grb::SUCCESS; + } + + /** + * \internal Implements sparse masked, sparse unmasked, and dense masked + * sptrsv. + */ + template< + Descriptor descr, + bool masked, bool sparse, + class Semiring, class Subtraction, class Division, + typename IOType, typename InputType1, typename InputType2, + typename Coords, typename RIT, typename CIT, typename NIT + > + RC generic_sptrsv( + Vector< IOType, reference, Coords > &xb, + const Vector< InputType2, reference, Coords > &mask, + const Matrix< InputType1, reference, RIT, CIT, NIT > &T, + const size_t &n, + const bool forward, + const Semiring &semiring, + const Subtraction &subtraction, + const Division &division, + const Phase &phase + ) { + static_assert( masked || sparse, "Internal logic error; please submit a bug " + "report" ); + assert( grb::size( xb ) == n ); + assert( !masked || grb::size( mask ) == n ); + assert( grb::nrows( T ) == n ); + assert( grb::ncols( T ) == n ); + (void) masked; + (void) sparse; + (void) semiring; + (void) subtraction; + (void) division; + (void) phase; + std::cerr << "Warning: masked sptrsv not yet implemented\n"; + return grb::UNSUPPORTED; + } + + } // end grb::internal + + template< + Descriptor descr = descriptors::no_operation, + class Semiring, class Subtraction, class Division, + typename IOType, typename InputType1, + typename Coords, typename RIT, typename CIT, typename NIT + > + RC sptrsv( + Vector< IOType, reference, Coords > &xb, + const Matrix< InputType1, reference, RIT, CIT, NIT > &T, + const bool forward, + const Semiring &semiring = Semiring(), + const Subtraction &subtraction = Subtraction(), + const Division &division = Division(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Semiring >::value && + grb::is_operator< Subtraction >::value && + grb::is_operator< Division >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value, + void >::type * const = nullptr + ) { + // check contract + constexpr bool dense = descr & descriptors::dense; + const size_t n = size( xb ); + if( grb::nrows( T ) != n ) { + std::cerr << "Error, sptrsv (unmasked): matrix should have a number of rows " + << "equal to the size of the input/output vector\n"; + return grb::ILLEGAL; + } + if( grb::ncols( T ) != n ) { + std::cerr << "Error, sptrsv (unmasked): matrix should have a number of " + << "columns equal to the size of the input/output vector\n"; + return grb::ILLEGAL; + } + if( dense && grb::nnz( xb ) < n ) { + std::cerr << "Error, sptrsv (unmasked): sparse vector given in conjunction " + << "with a dense descriptor\n"; + return grb::ILLEGAL; + } + + // check trivial + if( n == 0 ) { return grb::SUCCESS; } + + // check dense dispatch + if( dense || grb::nnz( xb ) == n ) { + return internal::dense_unmasked_sptrsv< descr >( xb, T, forward, semiring, + subtraction, division, phase ); + } else { + grb::Vector< IOType, reference, Coords > no_mask( 0 ); + return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T, + forward, semiring, subtraction, division, phase ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Semiring, + class Subtraction, + class Division, + typename IOType, typename InputType1, typename InputType2, + typename Coords, typename RIT, typename CIT, typename NIT + > + RC sptrsv( + Vector< IOType, reference, Coords > &xb, + const Vector< InputType2, reference, Coords > &mask, + const Matrix< InputType1, reference, RIT, CIT, NIT > &T, + const bool forward, + const Semiring &semiring = Semiring(), + const Subtraction &subtraction = Subtraction(), + const Division &division = Division(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Semiring >::value && + grb::is_operator< Subtraction >::value && + grb::is_operator< Division >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value, + void >::type * const = nullptr + ) { + // check if can forward + if( grb::size( mask ) == 0 ) { + return sptrsv< descr >( xb, T, forward, semiring, subtraction, division, + phase ); + } + + // check contract + constexpr bool dense = descr & descriptors::dense; + const size_t n = size( xb ); + if( grb::size( mask ) != n ) { + std::cerr << "Error, grb::sptrsv (masked): mask and input/output vector " + << "should have the same size\n"; + return grb::ILLEGAL; + } + if( grb::nrows( T ) != n ) { + std::cerr << "Error, grb::sptrsv (masked): matrix should have a number of " + << "rows equal to the size of the input/output vector\n"; + return grb::ILLEGAL; + } + if( grb::ncols( T ) != n ) { + std::cerr << "Error, grb::sptrsv (masked): matrix should have a number of " + << "columns equal to the size of the input/output vector\n"; + return grb::ILLEGAL; + } + if( dense && grb::nnz( mask ) < n ) { + std::cerr << "Error, grb::sptrsv (masked): sparse mask but dense descriptor " + << "was given\n"; + return grb::ILLEGAL; + } + if( dense && grb::nnz( xb ) < n ) { + std::cerr << "Error, grb::sptrsv (masked): sparse input vector given but " + "dense descriptor was given\n"; + return grb::ILLEGAL; + } + + if( dense || (grb::nnz( xb ) == n && grb::nnz( mask ) == n) ) { + return internal::generic_sptrsv< descr, true, false >( xb, mask, T, n, + forward, semiring, subtraction, division, phase ); + } else { + return internal::generic_sptrsv< descr, true, true >( xb, mask, T, n, + forward, semiring, subtraction, division, phase ); + } + } + + namespace internal { + #ifndef _H_GRB_REFERENCE_OMP_BLAS2 /** * A nonzero wrapper for use with grb::eWiseLambda over matrices. From e8b921e8f3dbbc9c67c90deeec362625aa9fa43e Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 17:07:24 +0200 Subject: [PATCH 04/55] Add unmasked sptrsv to base spec --- include/graphblas/base/blas2.hpp | 46 ++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp index 902c83a58..7e3274ea4 100644 --- a/include/graphblas/base/blas2.hpp +++ b/include/graphblas/base/blas2.hpp @@ -430,6 +430,52 @@ namespace grb { return UNSUPPORTED; } + /** + * Computes \f$ x \f$ from \f$ Tx = b \f$, unmasked variant. + * + * \todo Extend documentation + */ + template< + Descriptor descr = descriptors::no_operation, + class Semiring, + class Subtraction, + class Division, + typename IOType, typename InputType1, + typename Coords, typename RIT, typename CIT, typename NIT, + Backend backend + > + RC sptrsv( + Vector< IOType, backend, Coords > &xb, + const Matrix< InputType1, backend, RIT, CIT, NIT > &T, + const bool forward, + const Semiring &semiring = Semiring(), + const Subtraction &subtraction = Subtraction(), + const Division &division = Division(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Semiring >::value && + grb::is_operator< Subtraction >::value && + grb::is_operator< Division >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value, + void >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cerr << "Selected backend does not implement masked grb::sptrsv\n"; +#endif +#ifndef NDEBUG + const bool selected_backend_does_not_support_unmasked_sptrsv = false; + assert( selected_backend_does_not_support_unmasked_sptrsv ); +#endif + (void) xb; + (void) T; + (void) semiring; + (void) subtraction; + (void) division; + (void) phase; + return UNSUPPORTED; + } + /** * Executes an arbitrary element-wise user-defined function \a f on all * nonzero elements of a given matrix \a A. From f28db62d953ec6f79482deccb171f0a021d2305d Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 17:07:58 +0200 Subject: [PATCH 05/55] Bugfixes, remove compiler warnings, and some debug tracing --- include/graphblas/reference/blas2.hpp | 38 ++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 2cfeac82b..9b60c5d7d 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2231,15 +2231,23 @@ namespace grb { const Semiring &semiring, const Subtraction &subtraction ) { - constexpr auto one = Semiring::template getOne< typename Semiring::D2 >(); + constexpr auto one = + Semiring::template One< typename Semiring::D2 >::value(); for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { const typename Semiring::D2 val = crs.template getValue( k, one ); const auto &ind = crs.row_index[ k ]; - if( ind == i ) { + if( static_cast< size_t >(ind) == i ) { + #ifdef _DEBUG + std::cout << "\tskipping nonzero at " << i ", " << i << "\n"; + #endif divBy = val; continue; } - const typename Semiring::D3 tmp; + #ifdef _DEBUG + std::cout << "\t x[ " << i << " ] (" << x[i] << ") -= " << val << "* x[ " + << ind << " ] (" << x[ind] << ")\n"; + #endif + typename Semiring::D3 tmp; (void) grb::apply( tmp, val, x[ ind ], semiring.getMultiplicativeOperator() ); (void) grb::foldl( x[ i ], tmp, subtraction ); @@ -2278,7 +2286,6 @@ namespace grb { // get required data handles IOType * const v_raw = internal::getRaw( xb ); - size_t new_nz = 0; // switch forward or backward solve if( forward ) { @@ -2287,6 +2294,10 @@ namespace grb { IOType divBy = semiring.template getZero< IOType >(); assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); +#ifdef _DEBUG + std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy + << "\n"; +#endif (void) grb::foldl( v_raw[ i ], divBy, division ); } } else { @@ -2295,6 +2306,10 @@ namespace grb { IOType divBy = semiring.template getZero< IOType >(); assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction ); +#ifdef _DEBUG + std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy + << "\n"; +#endif (void) grb::foldl( v_raw[ i ], divBy, division ); } } @@ -2325,14 +2340,25 @@ namespace grb { const Division &division, const Phase &phase ) { + // static checks static_assert( masked || sparse, "Internal logic error; please submit a bug " "report" ); + + //dynamic checks assert( grb::size( xb ) == n ); assert( !masked || grb::size( mask ) == n ); assert( grb::nrows( T ) == n ); assert( grb::ncols( T ) == n ); + (void) masked; (void) sparse; +#ifdef NDEBUG + (void) xb; + (void) mask; + (void) T; + (void) n; +#endif + (void) forward; (void) semiring; (void) subtraction; (void) division; @@ -2389,11 +2415,11 @@ namespace grb { // check dense dispatch if( dense || grb::nnz( xb ) == n ) { - return internal::dense_unmasked_sptrsv< descr >( xb, T, forward, semiring, + return internal::dense_unmasked_sptrsv< descr >( xb, T, n, forward, semiring, subtraction, division, phase ); } else { grb::Vector< IOType, reference, Coords > no_mask( 0 ); - return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T, + return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T, n, forward, semiring, subtraction, division, phase ); } } From 6181cc3b27d96bb2f630489d2e3004d36f9c00ca Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 17:25:28 +0200 Subject: [PATCH 06/55] Add test for new sptrsv function --- tests/unit/CMakeLists.txt | 6 + tests/unit/dense_sptrsv.cpp | 320 ++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 tests/unit/dense_sptrsv.cpp diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 5cd03cf77..049af30e9 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -374,6 +374,12 @@ add_grb_executables( launch_benchmark_frommpi_manual launcherAndBenchmarker.cpp COMPILE_DEFINITIONS NO_LPF_AUTO_INIT ) +# tests for experimental features + +add_grb_executables( dense_sptrsv dense_sptrsv.cpp + BACKENDS reference reference_omp +) + # targets to list and build the test for this category get_property( unit_tests_list GLOBAL PROPERTY tests_category_unit ) add_custom_target( "list_tests_category_unit" diff --git a/tests/unit/dense_sptrsv.cpp b/tests/unit/dense_sptrsv.cpp new file mode 100644 index 000000000..c7138fe94 --- /dev/null +++ b/tests/unit/dense_sptrsv.cpp @@ -0,0 +1,320 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "graphblas/algorithms/matrix_factory.hpp" +#include "graphblas/utils/timer.hpp" + +#include "graphblas.hpp" + + +using namespace grb; + +struct input { + size_t n; + size_t rep; +}; + +struct output { + enum grb::RC error_code; + grb::utils::TimerResults times; +}; + +template< typename T, class Semiring > +bool checkVectorDenseAndEqualTo( + const grb::Vector< T > &x, const T val, struct output &out, + const Semiring &semiring +) { + size_t count = 0; + bool error = false; + std::set< size_t > localSet; + const size_t n = grb::size( x ); + for( const auto &pair : x ) { + (void) ++count; + localSet.insert( pair.first ); + if( pair.second != val ) { + std::cout << "unexpected entry ( " << pair.first << ", " << pair.second + << " ); expected value " << val << "\n"; + error = true; + } + } + if( grb::collectives<>::allreduce( count, semiring.getAdditiveOperator() ) + != grb::SUCCESS + ) { + std::cerr << "error during computing checksum (I)\n"; + out.error_code = grb::PANIC; + return false; + } + if( count != n ) { + std::cout << "expected " << n << " nonzeroes, got " << count << "\n"; + error = true; + } + count = localSet.size(); + if( grb::collectives<>::allreduce( count, semiring.getAdditiveOperator() ) + != grb::SUCCESS + ) { + std::cerr << "error during computing checksum (II)\n"; + out.error_code = grb::PANIC; + return false; + } + if( count != n ) { + std::cout << "expected " << n << " indices, got " << count << "\n"; + error = true; + } + if( out.error_code == grb::SUCCESS ) { + if( error ) { + out.error_code = grb::FAILED; + return false; + } + } + return true; +} + +// main label propagation algorithm +void grbProgram( const struct input &data_in, struct output &out ) { + out.error_code = grb::PANIC; + grb::utils::Timer timer; + + const size_t s = grb::spmd<>::pid(); +#ifndef NDEBUG + assert( s < grb::spmd<>::nprocs() ); +#else + (void) s; +#endif + + // get input n and test case + const size_t n = data_in.n; + + // setup + grb::Vector< double > x( n ); + grb::Matrix< double > A = grb::algorithms::matrices< double >::identity( n ); + Semiring< + grb::operators::add< double >, grb::operators::mul< double >, + grb::identities::zero, grb::identities::one + > semiring; + grb::operators::divide< double > division; + grb::operators::subtract< double > subtraction; + + out.error_code = grb::set( x, 3.14 ); + if( out.error_code ) { + std::cerr << "Error during test initialisation\n"; + return; + } + if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { + std::cerr << "NOTE: the aforementioned errors occurred during test " + << "initialisation\n"; + return; + } + out.times.preamble = timer.time(); + + if( s == 0 ) { + std::cout << "Test 1: "; + } + timer.reset(); + out.error_code = grb::sptrsv( x, A, false, semiring, subtraction, division ); + if( out.error_code ) { + std::cerr << "test returned error\n"; + } + if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { return; } + out.times.postamble += timer.time(); + timer.reset(); + for( size_t rep = 0; rep < data_in.rep; ++rep ) { + (void) grb::sptrsv( x, A, false, semiring, subtraction, division ); + } + out.times.useful += timer.time(); + + assert( out.error_code == grb::SUCCESS ); + if( s == 0 ) { + std::cout << "\b\b 2: "; + } + timer.reset(); + out.error_code = grb::set< grb::descriptors::dense >( x, 2.14 ); + if( out.error_code ) { + std::cerr << "Error during test initialisation\n"; + return; + } + if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { + std::cerr << "NOTE: aforementioned errors occurred during initialisation\n"; + return; + } + out.times.preamble += timer.time(); + timer.reset(); + out.error_code = grb::sptrsv( x, A, true, semiring, subtraction, division ); + if( out.error_code ) { + std::cerr << "test returned error\n"; + } + if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { return; } + out.times.postamble += timer.time(); + timer.reset(); + for( size_t rep = 0; rep < data_in.rep; ++rep ) { + (void) grb::sptrsv( x, A, true, semiring, subtraction, division ); + } + out.times.useful += timer.time(); + + assert( out.error_code == grb::SUCCESS ); + if( s == 0 ) { + std::cout << "\b\b 3: "; + } + timer.reset(); + out.error_code = grb::set< grb::descriptors::dense >( x, 3.14 ); + if( out.error_code ) { + std::cerr << "Error during test initialisation\n"; + return; + } + if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { + std::cerr << "NOTE: aforementioned errors occurred during initialisation\n"; + return; + } + out.times.preamble += timer.time(); + timer.reset(); + out.error_code = grb::sptrsv< grb::descriptors::dense >( x, A, false, semiring, + subtraction, division ); + if( out.error_code ) { + std::cerr << "test returned error\n"; + } + if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { return; } + out.times.postamble += timer.time(); + timer.reset(); + for( size_t rep = 0; rep < data_in.rep; ++rep ) { + (void) grb::sptrsv< grb::descriptors::dense >( x, A, false, semiring, + subtraction, division ); + } + out.times.useful += timer.time(); + + assert( out.error_code == grb::SUCCESS ); + if( s == 0 ) { + std::cout << "\b\b 4: "; + } + timer.reset(); + out.error_code = grb::set< grb::descriptors::dense >( x, 2.14 ); + if( out.error_code ) { + std::cerr << "Error during test initialisation\n"; + return; + } + if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { + std::cerr << "NOTE: aforementioned errors occurred during initialisation\n"; + return; + } + out.times.preamble += timer.time(); + timer.reset(); + out.error_code = grb::sptrsv< grb::descriptors::dense >( x, A, true, semiring, + subtraction, division ); + if( out.error_code ) { + std::cerr << "test returned error\n"; + } + if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { return; } + out.times.postamble += timer.time(); + timer.reset(); + for( size_t rep = 0; rep < data_in.rep; ++rep ) { + (void) grb::sptrsv< grb::descriptors::dense >( x, A, true, semiring, + subtraction, division ); + } + out.times.useful += timer.time(); + + std::cout << "OK\n"; +} + +// main function will execute in serial or as SPMD +int main( int argc, char ** argv ) { + // sanity check + if( argc < 3 || argc > 5 ) { + std::cout << "Usage: " << argv[ 0 ] << " " + << "(inner repititions) (outer repititions)" << std::endl; + return 0; + } + std::cout << "Test executable: " << argv[ 0 ] << std::endl; + + // the input struct + struct input in; + in.n = atoi( argv[ 1 ] ); + in.rep = grb::config::BENCHMARKING::inner(); + size_t outer = grb::config::BENCHMARKING::outer(); + char * end = NULL; + if( argc >= 3 ) { + in.rep = strtoumax( argv[ 2 ], &end, 10 ); + if( argv[ 2 ] == end ) { + std::cerr << "Could not parse argument for number of inner " + << "repetitions." << std::endl; + return 25; + } + } + if( argc >= 4 ) { + outer = strtoumax( argv[ 3 ], &end, 10 ); + if( argv[ 3 ] == end ) { + std::cerr << "Could not parse argument for number of outer " + << "reptitions." << std::endl; + return 30; + } + } + + std::cout << "Executable called with parameters: problem size " << in.n + << ", inner = " << in.rep << ", outer = " << outer << "." << std::endl; + + // the output struct + struct output out; + + // run the program one time to infer number of inner repititions + if( in.rep == 0 ) { + in.rep = 1; + grb::Launcher< AUTOMATIC > launcher; + const enum grb::RC rc = launcher.exec( &grbProgram, in, out, true ); + if( rc != SUCCESS ) { + std::cerr << "launcher.exec returns with non-SUCCESS error code " + << grb::toString( rc ) << std::endl; + return 40; + } + // set guesstimate for inner repititions: a single experiment should take at least a second + in.rep = static_cast< double >( 1000.0 / out.times.useful ) + 1; + std::cout << "Auto-selected number of inner repetitions is " + << in.rep << " (at an estimated time of " + << out.times.useful << " ms. of useful work per benchmark).\n"; + } + + // start benchmarks + grb::Benchmarker< AUTOMATIC > benchmarker; + const enum grb::RC rc = benchmarker.exec( &grbProgram, in, out, 1, outer, true ); + if( rc != SUCCESS ) { + std::cerr << "launcher.exec returns with non-SUCCESS error code " + << grb::toString( rc ) << std::endl; + return 50; + } + + // done + if( out.error_code != SUCCESS ) { + std::cout << "Test FAILED\n" << std::endl; + std::cerr << std::flush; + return out.error_code; + } + std::cout << "Test OK\n" << std::endl; + return 0; +} + From 8b11500215f3352d12eb109b411fc1ddc7bc7996 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 17:26:25 +0200 Subject: [PATCH 07/55] Warn about experimental nature of new primitive --- include/graphblas/base/blas2.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp index 7e3274ea4..2d61d46d4 100644 --- a/include/graphblas/base/blas2.hpp +++ b/include/graphblas/base/blas2.hpp @@ -340,6 +340,8 @@ namespace grb { /** * Computes \f$ x \f$ from \f$ Tx=b \f$. * + * \warning This is an experimental feature. + * * Here, \f$ T, b \f$ are given while \f$ T \f$ additionally must be either * lower- or upper-triangular. The output \f$ x \f$ may furthermore be masked. * @@ -433,6 +435,8 @@ namespace grb { /** * Computes \f$ x \f$ from \f$ Tx = b \f$, unmasked variant. * + * \warning This is an experimental feature. + * * \todo Extend documentation */ template< From 2bb446b7e156d211bc8d7f7367d7623d6bbf0795 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 17:48:03 +0200 Subject: [PATCH 08/55] Fix cast error --- include/graphblas/reference/blas2.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 9b60c5d7d..9906f7471 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2232,9 +2232,9 @@ namespace grb { const Subtraction &subtraction ) { constexpr auto one = - Semiring::template One< typename Semiring::D2 >::value(); + Semiring::template One< typename Semiring::D1 >::value(); for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { - const typename Semiring::D2 val = crs.template getValue( k, one ); + const typename Semiring::D1 val = crs.template getValue( k, one ); const auto &ind = crs.row_index[ k ]; if( static_cast< size_t >(ind) == i ) { #ifdef _DEBUG From ef2c4e4587a0f64a2c39336261ed977ae2e5b7ab Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 18:20:11 +0200 Subject: [PATCH 09/55] Simplify code a bit --- include/graphblas/reference/blas2.hpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 9906f7471..bbaf93d1e 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2233,6 +2233,7 @@ namespace grb { ) { constexpr auto one = Semiring::template One< typename Semiring::D1 >::value(); + // TODO could do some ALP-style vectorisation here for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { const typename Semiring::D1 val = crs.template getValue( k, one ); const auto &ind = crs.row_index[ k ]; @@ -2260,10 +2261,10 @@ namespace grb { Descriptor descr, class Semiring, class Subtraction, class Division, typename IOType, typename InputType1, - typename Coords, typename RIT, typename CIT, typename NIT + typename RIT, typename CIT, typename NIT > RC dense_unmasked_sptrsv( - Vector< IOType, reference, Coords > &xb, + IOType *__restrict__ const v_raw, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, const size_t &n, const bool forward, @@ -2273,8 +2274,6 @@ namespace grb { const Phase &phase ) { // dynamic sanity checks - assert( grb::size( xb ) == n ); - assert( grb::nnz( xb ) == n ); assert( grb::nrows( T ) == n ); assert( grb::ncols( T ) == n ); @@ -2284,9 +2283,6 @@ namespace grb { // only execute and resize are supported assert( phase == grb::EXECUTE ); - // get required data handles - IOType * const v_raw = internal::getRaw( xb ); - // switch forward or backward solve if( forward ) { const auto &crs = internal::getCRS( T ); @@ -2415,7 +2411,9 @@ namespace grb { // check dense dispatch if( dense || grb::nnz( xb ) == n ) { - return internal::dense_unmasked_sptrsv< descr >( xb, T, n, forward, semiring, + IOType * const xb_p = internal::getRaw( xb ); + return internal::dense_unmasked_sptrsv< descr >( + xb_p, T, n, forward, semiring, subtraction, division, phase ); } else { grb::Vector< IOType, reference, Coords > no_mask( 0 ); From ef8992170cb26c1cfdc30769c73bd468b9702905 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 18:20:24 +0200 Subject: [PATCH 10/55] Enable new sptrsv unit test --- tests/unit/unittests.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index 0b49d2421..e6d3f7644 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -197,6 +197,19 @@ for MODE in ${MODES}; do grep 'Test OK' ${TEST_OUT_DIR}/mxm_crs_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " + echo ">>> [x] [ ] Testing grb::sptrsv (unmasked, dense vectors) on" + echo " small simple matrices, re-entrant" + $runner ${TEST_BIN_DIR}/dense_sptrsv_${MODE}_${BACKEND} 10 1000 3 &> ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log + head -1 ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log || echo "Test FAILED" + echo " " + + echo ">>> [x] [ ] Testing grb::sptrsv (unmasked, dense vectors) on" + echo " large simple matrices, re-entrant" + $runner ${TEST_BIN_DIR}/dense_sptrsv_${MODE}_${BACKEND} 1000000 1 3 &> ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log + head -1 ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log + grep 'Test OK' ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log || echo "Test FAILED" + echo " " fi echo "#################################################################" From e28e1d42c06ea89157b2c268e6cf14e5a4a03ad4 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 20:44:55 +0200 Subject: [PATCH 11/55] Add parallel (OpenMP) sptrsv, which may take a schedule such as produced by OneStopParallel --- .../graphblas/reference/SptrsvSchedule.hpp | 118 ++++++++++++++++ include/graphblas/reference/blas2.hpp | 129 +++++++++++++++--- include/graphblas/reference/matrix.hpp | 34 ++++- 3 files changed, 258 insertions(+), 23 deletions(-) create mode 100644 include/graphblas/reference/SptrsvSchedule.hpp diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp new file mode 100644 index 000000000..86479c7d4 --- /dev/null +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -0,0 +1,118 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. Yzelman + * @date 4th of April, 2025 + */ + +#ifndef _H_GRB_REFERENCE_SPTRSVSCHEDULE +#define _H_GRB_REFERENCE_SPTRSVSCHEDULE + +#include +#include + +#include + + +namespace grb { + + namespace internal { + + template< typename NIT > + class SptrsvSchedule { + + private: + + /** One deleter for each chunk of thread-local data. */ + std::vector< utils::AutoDeleter< char > > _deleters; + + /** Computes and initialises a trivial schedule. */ + void initTrivial( const NIT n ) { + assert( data[ 0 ] == nullptr ); + const grb::RC rc = utils::alloc( + "grb::internal::SptrsvSchedule (default constructor)", + "default thread-local data allocation", + data[ 0 ], 2 * sizeof(NIT), false, _deleters[ 0 ] + ); + if( rc != grb::SUCCESS ) { + throw std::bad_alloc(); + } + NIT * const interpreted = reinterpret_cast< NIT * >(data[0]); + interpreted[ 0 ] = 0; + interpreted[ 1 ] = n; + } + + + public: + + /** Number of schedule steps. */ + size_t supersteps; + + /** + * The number of threads the schedule is designed for. + * + * \note Uses int as per OpenMP spec. + */ + int nThreads; + + /** One data pointer per thread. */ + std::vector< char * > data; + + /** + * Base constructor. + * + * @param[in] n The size of the linear system. + * + * Initialises the schedule to the trivial always-valid schedule, which is + * a single-thread single-superstep schedule that assigns all work to the + * first (and only) thread. + */ + SptrsvSchedule( const NIT n ) : + _deleters( 1 ), supersteps( 1 ), nThreads( 1 ), data( 1 ) + { + data[ 0 ] = nullptr; + initTrivial( n ); + } + + /** + * Specialised constructor for multiple threads. + * + * @param[in] n The size of the linear system. + * @param[in] T The number of threads a schedule should be created for. + * + * Initialises the schedule to the trivial always-valid schedule, which is + * a single-superstep schedule that assigns all work to the first thread. + */ + SptrsvSchedule( const NIT n, const size_t T ) : + _deleters( T ), supersteps( 0 ), nThreads( T ), data( T ) + { + data[ 0 ] = nullptr; + if( T == 0 || T > std::numeric_limits< int >::max() ) { + throw std::runtime_error( "Invalid number of threads" ); + } + initTrivial( n ); + } + + }; + + } // end namespace `grb::internal' + +} // end namespace `grb' + +#endif // _H_GRB_REFERENCE_SPTRSVSCHEDULE + diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index bbaf93d1e..e79c3a80c 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2256,17 +2256,17 @@ namespace grb { } #endif - /** \internal Specialised dense unmasked sptrsv implementation */ + /** \internal Specialised dense unmasked sptrsv implementation, sequential */ template< - Descriptor descr, + Descriptor descr, bool maybe_offset, class Semiring, class Subtraction, class Division, typename IOType, typename InputType1, typename RIT, typename CIT, typename NIT > - RC dense_unmasked_sptrsv( + RC dense_unmasked_sequential_sptrsv( IOType *__restrict__ const v_raw, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, - const size_t &n, + const size_t offset, const size_t &n, const bool forward, const Semiring &semiring, const Subtraction &subtraction, @@ -2286,27 +2286,53 @@ namespace grb { // switch forward or backward solve if( forward ) { const auto &crs = internal::getCRS( T ); - for( size_t i = 0; i < n; ++i ) { - IOType divBy = semiring.template getZero< IOType >(); - assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); - sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); + if( maybe_offset ) { + for( size_t i = 0; i < n; ++i ) { + IOType divBy = semiring.template getZero< IOType >(); + assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); + sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); #ifdef _DEBUG - std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy - << "\n"; + std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy + << "\n"; #endif - (void) grb::foldl( v_raw[ i ], divBy, division ); + (void) grb::foldl( v_raw[ i ], divBy, division ); + } + } else { + for( size_t i = offset; i < n; ++i ) { + IOType divBy = semiring.template getZero< IOType >(); + assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); + sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); +#ifdef _DEBUG + std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy + << "\n"; +#endif + (void) grb::foldl( v_raw[ i ], divBy, division ); + } } } else { const auto &ccs = internal::getCCS( T ); - for( size_t i = n - 1; i < n; --i ) { - IOType divBy = semiring.template getZero< IOType >(); - assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); - sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction ); + if( !maybe_offset || offset == 0 ) { + for( size_t i = n - 1; i < n; --i ) { + IOType divBy = semiring.template getZero< IOType >(); + assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); + sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction ); #ifdef _DEBUG - std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy - << "\n"; + std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy + << "\n"; #endif - (void) grb::foldl( v_raw[ i ], divBy, division ); + (void) grb::foldl( v_raw[ i ], divBy, division ); + } + } else { + for( size_t i = n - 1 + offset; i >= offset; --i ) { + IOType divBy = semiring.template getZero< IOType >(); + assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); + sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction ); +#ifdef _DEBUG + std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy + << "\n"; +#endif + (void) grb::foldl( v_raw[ i ], divBy, division ); + } } } @@ -2314,6 +2340,63 @@ namespace grb { return grb::SUCCESS; } +#ifdef _H_GRB_REFERENCE_OMP_BLAS2 + /** \internal Specialised dense unmasked sptrsv implementation, OpenMP */ + template< + Descriptor descr, + class Semiring, class Subtraction, class Division, + typename IOType, typename InputType1, + typename RIT, typename CIT, typename NIT + > + RC dense_unmasked_omp_sptrsv( + IOType *__restrict__ const v_raw, + const Matrix< InputType1, reference, RIT, CIT, NIT > &T, + const size_t &n, + const bool forward, + const Semiring &semiring, + const Subtraction &subtraction, + const Division &division, + const Phase &phase + ) { + // dynamic sanity checks + assert( grb::nrows( T ) == n ); + assert( grb::ncols( T ) == n ); + + // in dense unmasked, resize is a no-op + if( phase == grb::RESIZE ) { return grb::SUCCESS; } + + // get SpTrsv schedule + const SptrsvSchedule< NIT > *sptrsvSchedule_p = + internal::getSptrsvData( T ); + const SptrsvSchedule< NIT > &sptrsv = sptrsvSchedule_p == nullptr ? + SptrsvSchedule< NIT >( n ) : *sptrsvSchedule_p; + + // only execute and resize are supported + assert( phase == grb::EXECUTE ); + + // start parallel section + grb::RC ret = grb::SUCCESS; + #pragma omp parallel num_threads(sptrsv.nThreads) + { + const int s = omp_get_thread_num(); + const NIT *__restrict__ data = + reinterpret_cast< const NIT * >(sptrsv.data[ s ]); + for( size_t i = 0; i < sptrsv.supersteps; ++i ) { + const size_t lo = static_cast< size_t >( *data++ ); + const size_t no = static_cast< size_t >( *data++ ); + assert( lo < n ); + assert( no + lo <= n ); + ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + #pragma omp barrier + } + } + + // done + return ret; + } +#endif + /** * \internal Implements sparse masked, sparse unmasked, and dense masked * sptrsv. @@ -2412,9 +2495,13 @@ namespace grb { // check dense dispatch if( dense || grb::nnz( xb ) == n ) { IOType * const xb_p = internal::getRaw( xb ); - return internal::dense_unmasked_sptrsv< descr >( - xb_p, T, n, forward, semiring, - subtraction, division, phase ); +#ifndef _H_GRB_REFERENCE_OMP_BLAS2 + return internal::dense_unmasked_sequential_sptrsv< descr, false >( + xb_p, T, 0, n, forward, semiring, subtraction, division, phase ); +#else + return internal::dense_unmasked_omp_sptrsv< descr >( + xb_p, T, n, forward, semiring, subtraction, division, phase ); +#endif } else { grb::Vector< IOType, reference, Coords > no_mask( 0 ); return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T, n, diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index c1b3a365d..dfad4d6f7 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -49,6 +49,7 @@ #include #include +#include #include #include "NonzeroWrapper.hpp" @@ -237,6 +238,18 @@ namespace grb { return A.valbuf[ 1 ]; } + /** + * \internal + * Retrieves an optimised sptrsv-specific schedule. + * \endinternal + */ + template< typename InputType, typename RIT, typename CIT, typename NIT > + const internal::SptrsvSchedule< NIT > * getSptrsvData( + const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A + ) noexcept { + return A.sptrsvSchedule; + } + template< Descriptor descr, bool input_dense, bool output_dense, @@ -1225,6 +1238,11 @@ namespace grb { const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A ) noexcept; + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend const internal::SptrsvSchedule< NIT > * internal::getSptrsvData( + const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A + ) noexcept; + friend const grb::Matrix< D, reference, ColIndexType, ColIndexType, NonzeroIndexType @@ -1351,13 +1369,19 @@ namespace grb { */ utils::AutoDeleter< char > _local_deleter[ 6 ]; + /** + * Optimised schedule for sptrsv operations on this matrix. + */ + internal::SptrsvSchedule< NonzeroIndexType > * sptrsvSchedule; + /** * Internal constructor for manual construction of matrices. * * Should be followed by a manual call to #initialize. */ Matrix() : id( std::numeric_limits< uintptr_t >::max() ), - remove_id( false ), m( 0 ), n( 0 ), cap( 0 ), nz( 0 ) + remove_id( false ), m( 0 ), n( 0 ), cap( 0 ), nz( 0 ), + sptrsvSchedule( nullptr ) {} /** @@ -1403,7 +1427,7 @@ namespace grb { id( std::numeric_limits< uintptr_t >::max() ), remove_id( false ), m( _m ), n( _n ), cap( _cap ), nz( _offset_array[ _m ] ), coorArr{ nullptr, buf1 }, coorBuf{ nullptr, buf2 }, - valbuf{ nullptr, buf3 } + valbuf{ nullptr, buf3 }, sptrsvSchedule( nullptr ) { assert( (_m > 0 && _n > 0) || _column_indices[ 0 ] == 0 ); CRS.replace( _values, _column_indices ); @@ -1596,6 +1620,8 @@ namespace grb { _deleter[ i ] = std::move( other._deleter[ i ] ); _local_deleter[ i ] = std::move( other._local_deleter[ i ] ); } + if( sptrsvSchedule ) { delete sptrsvSchedule; } + sptrsvSchedule = other.sptrsvSchedule; // invalidate other fields for( unsigned int i = 0; i < 2; ++i ) { @@ -1608,6 +1634,7 @@ namespace grb { other.n = 0; other.cap = 0; other.nz = 0; + other.sptrsvSchedule = nullptr; } /** @@ -2228,6 +2255,9 @@ namespace grb { #endif internal::reference_mapper.remove( id ); } + if( sptrsvSchedule ) { + delete sptrsvSchedule; + } } /** From 20ea13fdae8ebc659ab73238466c8186890c43da Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 21:00:43 +0200 Subject: [PATCH 12/55] Delete from move constructor should not occur --- include/graphblas/reference/matrix.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index dfad4d6f7..fc02e001b 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -1448,6 +1448,7 @@ namespace grb { const size_t rows, const size_t cols, const size_t cap_in ) { + sptrsvSchedule = nullptr; #ifdef _DEBUG_REFERENCE_MATRIX std::cerr << "\t in Matrix< reference >::initialize...\n" << "\t\t matrix size " << rows << " by " << cols << "\n" @@ -1620,7 +1621,6 @@ namespace grb { _deleter[ i ] = std::move( other._deleter[ i ] ); _local_deleter[ i ] = std::move( other._local_deleter[ i ] ); } - if( sptrsvSchedule ) { delete sptrsvSchedule; } sptrsvSchedule = other.sptrsvSchedule; // invalidate other fields From 45b9f46e4730df1042d7b9bf52a7e9a84e57be8a Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 4 Apr 2025 21:04:30 +0200 Subject: [PATCH 13/55] If we set sptrsvSchedule to default in initialize(...), then we do not need to do so in the default constructor --- include/graphblas/reference/matrix.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index fc02e001b..244d7599d 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -1380,8 +1380,7 @@ namespace grb { * Should be followed by a manual call to #initialize. */ Matrix() : id( std::numeric_limits< uintptr_t >::max() ), - remove_id( false ), m( 0 ), n( 0 ), cap( 0 ), nz( 0 ), - sptrsvSchedule( nullptr ) + remove_id( false ), m( 0 ), n( 0 ), cap( 0 ), nz( 0 ) {} /** @@ -1448,6 +1447,7 @@ namespace grb { const size_t rows, const size_t cols, const size_t cap_in ) { + // SpTrsvSchedule should be manually set always by a requested tuning phase sptrsvSchedule = nullptr; #ifdef _DEBUG_REFERENCE_MATRIX std::cerr << "\t in Matrix< reference >::initialize...\n" From 2413a524ca5bae29a317b77b9846476c90774a8d Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Mon, 7 Apr 2025 10:29:39 +0200 Subject: [PATCH 14/55] Coding blind for a sec here, apologies -- will test it once I have my machine back --- .../graphblas/reference/SptrsvSchedule.hpp | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 86479c7d4..ea7d184ab 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -44,19 +44,28 @@ namespace grb { /** Computes and initialises a trivial schedule. */ void initTrivial( const NIT n ) { assert( data[ 0 ] == nullptr ); + data[ 0 ] = &(default_schedule[0]); + NIT * const interpreted = reinterpret_cast< NIT * >(data[0]); + interpreted[ 0 ] = 0; + interpreted[ 1 ] = n; + } + + /** Allocates a thread-local chunk of data. */ + void alloc( const size_t s ) { + assert( data[ s ] == nullptr ); const grb::RC rc = utils::alloc( "grb::internal::SptrsvSchedule (default constructor)", "default thread-local data allocation", - data[ 0 ], 2 * sizeof(NIT), false, _deleters[ 0 ] + data[ s ], supersteps * sizeof(NIT), false, _deleters[ s ] ); if( rc != grb::SUCCESS ) { throw std::bad_alloc(); } - NIT * const interpreted = reinterpret_cast< NIT * >(data[0]); - interpreted[ 0 ] = 0; - interpreted[ 1 ] = n; } + /** Fixed-size buffer for realising the default schedule. */ + NIT default_schedule[2]; + public: @@ -99,7 +108,7 @@ namespace grb { * a single-superstep schedule that assigns all work to the first thread. */ SptrsvSchedule( const NIT n, const size_t T ) : - _deleters( T ), supersteps( 0 ), nThreads( T ), data( T ) + _deleters( T ), supersteps( 1 ), nThreads( T ), data( T ) { data[ 0 ] = nullptr; if( T == 0 || T > std::numeric_limits< int >::max() ) { @@ -108,6 +117,19 @@ namespace grb { initTrivial( n ); } + /** + * Base destructor. + * + * Dynamic memory is freed through the #_deleters. + */ + ~SptrsvSchedule() { + // sanity checks only + assert( nThreads != 0 ); + if( nThreads == 1 ) { + assert( supersteps == 1 ); + } + } + }; } // end namespace `grb::internal' From 32905e538cd5bdc3444933851c4600ffc4e08d2c Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Mon, 7 Apr 2025 12:54:25 +0200 Subject: [PATCH 15/55] Fix compile error and some more defensive programming --- include/graphblas/reference/SptrsvSchedule.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index ea7d184ab..06dc431dc 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -44,14 +44,22 @@ namespace grb { /** Computes and initialises a trivial schedule. */ void initTrivial( const NIT n ) { assert( data[ 0 ] == nullptr ); - data[ 0 ] = &(default_schedule[0]); + data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) ); NIT * const interpreted = reinterpret_cast< NIT * >(data[0]); interpreted[ 0 ] = 0; interpreted[ 1 ] = n; } - /** Allocates a thread-local chunk of data. */ + /** + * Allocates a thread-local chunk of data. + * + * Must be called from within the thread that will use it(!) + */ void alloc( const size_t s ) { + assert( s < nThreads ); + assert( supersteps > 0 ); + assert( _deleters.size() >= s ); + assert( data.size() >= s ); assert( data[ s ] == nullptr ); const grb::RC rc = utils::alloc( "grb::internal::SptrsvSchedule (default constructor)", From 98206d0a52260d4f8593bf847513073e56e6eb2b Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Mon, 7 Apr 2025 18:26:28 +0200 Subject: [PATCH 16/55] Add internal functions to set an Sptrsv schedule. Next up: test the performance of the sptrsv, in particular over multiple NUMA domains --- .../graphblas/reference/SptrsvSchedule.hpp | 46 +++++---- include/graphblas/reference/matrix.hpp | 97 +++++++++++++++++++ 2 files changed, 119 insertions(+), 24 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 06dc431dc..2e02b95f9 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -50,27 +50,6 @@ namespace grb { interpreted[ 1 ] = n; } - /** - * Allocates a thread-local chunk of data. - * - * Must be called from within the thread that will use it(!) - */ - void alloc( const size_t s ) { - assert( s < nThreads ); - assert( supersteps > 0 ); - assert( _deleters.size() >= s ); - assert( data.size() >= s ); - assert( data[ s ] == nullptr ); - const grb::RC rc = utils::alloc( - "grb::internal::SptrsvSchedule (default constructor)", - "default thread-local data allocation", - data[ s ], supersteps * sizeof(NIT), false, _deleters[ s ] - ); - if( rc != grb::SUCCESS ) { - throw std::bad_alloc(); - } - } - /** Fixed-size buffer for realising the default schedule. */ NIT default_schedule[2]; @@ -82,10 +61,8 @@ namespace grb { /** * The number of threads the schedule is designed for. - * - * \note Uses int as per OpenMP spec. */ - int nThreads; + size_t nThreads; /** One data pointer per thread. */ std::vector< char * > data; @@ -138,6 +115,27 @@ namespace grb { } } + /** + * Allocates a thread-local chunk of data. + * + * Must be called from within the thread that will use it(!) + */ + void alloc( const size_t s ) { + assert( s < nThreads ); + assert( supersteps > 0 ); + assert( _deleters.size() >= s ); + assert( data.size() >= s ); + assert( data[ s ] == nullptr ); + const grb::RC rc = utils::alloc( + "grb::internal::SptrsvSchedule (default constructor)", + "default thread-local data allocation", + data[ s ], supersteps * sizeof(NIT), false, _deleters[ s ] + ); + if( rc != grb::SUCCESS ) { + throw std::bad_alloc(); + } + } + }; } // end namespace `grb::internal' diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 244d7599d..461cd9975 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -250,6 +250,85 @@ namespace grb { return A.sptrsvSchedule; } + template< + typename InputType, typename RIT, typename CIT, typename NIT + > + void allocateSptrsvSchedule( + grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, + const size_t nSteps, const size_t nThreads + ) { + if( A.sptrsvSchedule == nullptr ) { + throw std::runtime_error( "SptrsvSchedule was not initialised" ); + } + auto &sptrsv = *(A.sptrsvSchedule); + if( sptrsv.nThreads != nThreads ) { + throw std::runtime_error( "SptrsvSchedule was allocated with different " + "nThreads" ); + } + sptrsv.supersteps = nSteps; + + #pragma omp parallel + { + const size_t actualNumThreads = omp_get_num_threads(); + if( actualNumThreads != nThreads ) { + throw std::runtime_error( "Unexpected number of threads" ); + } + const size_t s = omp_get_thread_num(); + if( sptrsv.data[ s ] ) { + throw std::runtime_error( "A thread-local schedule already existed" ); + } + sptrsv.alloc( s ); + } + } + + template< + typename InputType, typename RIT, typename CIT, typename NIT, + typename LoIt, typename HiIt + > + void setSptrsvSchedule( + grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, + LoIt lo, const LoIt &lo_end, HiIt hi, const HiIt &hi_end, + const size_t nThreads + ) { + if( A.sptrsvSchedule == nullptr ) { + throw std::runtime_error( "SptrsvSchedule was not initialised" ); + } + auto &sptrsv = *(A.sptrsvSchedule); + if( sptrsv.nThreads != nThreads ) { + throw std::runtime_error( "SptrsvSchedule was allocated with different " + "nThreads" ); + } + + #pragma omp parallel + { + const size_t actualNumThreads = omp_get_num_threads(); + if( actualNumThreads != nThreads ) { + throw std::runtime_error( "Unexpected number of threads" ); + } + const size_t s = omp_get_thread_num(); + NIT *__restrict__ const array = reinterpret_cast< NIT * >(sptrsv.data[ s ]); + assert( lo != lo_end ); + assert( hi != hi_end ); + size_t count = 0; + do { + const NIT l = *lo++; + const NIT h = *hi++; + assert( h >= l ); + const NIT n = h - l; + if( count >= sptrsv.supersteps ) { + throw std::runtime_error( "Too many supersteps" ); + } + *array++ = l; + *array++ = n; + (void) count++; + if( lo == lo_end ) { assert( hi == hi_end ); } + } while( lo != lo_end && hi != hi_end ); + if( count != sptrsv.supersteps ) { + throw std::runtime_error( "Unexpected number of supersteps" ); + } + } + } + template< Descriptor descr, bool input_dense, bool output_dense, @@ -1243,6 +1322,24 @@ namespace grb { const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A ) noexcept; + template< + typename InputType, typename RIT, typename CIT, typename NIT + > + friend void internal::allocateSptrsvSchedule( + grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, + const size_t nSteps, const size_t nThreads + ); + + template< + typename InputType, typename RIT, typename CIT, typename NIT, + typename LoIt, typename HiIt + > + friend void internal::setSptrsvSchedule( + grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, + LoIt lo, const LoIt &lo_end, HiIt hi, const HiIt &hi_end, + const size_t nThreads + ); + friend const grb::Matrix< D, reference, ColIndexType, ColIndexType, NonzeroIndexType From 42773ca5a265bab0a5e182f8dc13c813e17db861 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Tue, 8 Apr 2025 16:48:33 +0200 Subject: [PATCH 17/55] OSP integration error: input schedule has slightly different structure --- include/graphblas/reference/matrix.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 461cd9975..72e077eff 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -311,8 +311,8 @@ namespace grb { assert( hi != hi_end ); size_t count = 0; do { - const NIT l = *lo++; - const NIT h = *hi++; + const NIT l = (*lo++)[s]; + const NIT h = (*hi++)[s]; assert( h >= l ); const NIT n = h - l; if( count >= sptrsv.supersteps ) { From b680b6e5550d770f9bb4e069ffb1e65b0b3ba964 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Tue, 8 Apr 2025 16:59:21 +0200 Subject: [PATCH 18/55] I still did not get it, but this should now be correct --- include/graphblas/reference/matrix.hpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 72e077eff..e3898d3d0 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -282,12 +282,11 @@ namespace grb { } template< - typename InputType, typename RIT, typename CIT, typename NIT, - typename LoIt, typename HiIt + typename InputType, typename RIT, typename CIT, typename NIT, typename It > void setSptrsvSchedule( grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, - LoIt lo, const LoIt &lo_end, HiIt hi, const HiIt &hi_end, + It bounds, const It &bounds_end, const size_t nThreads ) { if( A.sptrsvSchedule == nullptr ) { @@ -307,12 +306,11 @@ namespace grb { } const size_t s = omp_get_thread_num(); NIT *__restrict__ const array = reinterpret_cast< NIT * >(sptrsv.data[ s ]); - assert( lo != lo_end ); - assert( hi != hi_end ); + assert( bounds != bounds_end ); size_t count = 0; do { - const NIT l = (*lo++)[s]; - const NIT h = (*hi++)[s]; + const NIT l = (*bounds)[s][0]; + const NIT h = (*bounds)[s][1]; assert( h >= l ); const NIT n = h - l; if( count >= sptrsv.supersteps ) { @@ -321,8 +319,8 @@ namespace grb { *array++ = l; *array++ = n; (void) count++; - if( lo == lo_end ) { assert( hi == hi_end ); } - } while( lo != lo_end && hi != hi_end ); + (void) bounds++; + } while( bounds != bounds_end ); if( count != sptrsv.supersteps ) { throw std::runtime_error( "Unexpected number of supersteps" ); } From 6f6cdecaeb35b013b2b381681ccd4cfb05443ac0 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Tue, 8 Apr 2025 17:05:09 +0200 Subject: [PATCH 19/55] Forgot to update friend signature --- include/graphblas/reference/matrix.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index e3898d3d0..73cad8525 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -1329,12 +1329,11 @@ namespace grb { ); template< - typename InputType, typename RIT, typename CIT, typename NIT, - typename LoIt, typename HiIt + typename InputType, typename RIT, typename CIT, typename NIT, typename It > friend void internal::setSptrsvSchedule( grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, - LoIt lo, const LoIt &lo_end, HiIt hi, const HiIt &hi_end, + It bounds, const It &bounds_end, const size_t nThreads ); From ddf70bf707afa3c3da1bd4d9a2b61fb216e0d59b Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Tue, 8 Apr 2025 17:07:11 +0200 Subject: [PATCH 20/55] Bugfix --- include/graphblas/reference/matrix.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 73cad8525..7b7c43098 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -305,7 +305,8 @@ namespace grb { throw std::runtime_error( "Unexpected number of threads" ); } const size_t s = omp_get_thread_num(); - NIT *__restrict__ const array = reinterpret_cast< NIT * >(sptrsv.data[ s ]); + // get buffer as an array of NIT, which we will write to in one pass + NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]); assert( bounds != bounds_end ); size_t count = 0; do { From 18e79268b4a4a31943890eb81fcd33c581992976 Mon Sep 17 00:00:00 2001 From: Christos Konstantinos Matzoros Date: Wed, 9 Apr 2025 19:04:32 +0200 Subject: [PATCH 21/55] Bug Fixes --- .../graphblas/reference/SptrsvSchedule.hpp | 25 ++++++++++++++++--- include/graphblas/reference/blas2.hpp | 3 ++- include/graphblas/reference/matrix.hpp | 22 +++++++++------- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 2e02b95f9..e4d8426b9 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -53,6 +53,16 @@ namespace grb { /** Fixed-size buffer for realising the default schedule. */ NIT default_schedule[2]; + void moveImpl( SptrsvSchedule && toMove ) { + _deleters = std::move( toMove._deleters ); + default_schedule = std::move( toMove.default_schedule ); + supersteps = toMove.supersteps; + nThreads = toMove.nThreads; + data = std::move( toMove.data ); + toMove.supersteps = 0; + toMove.nThreads = 0; + } + public: @@ -67,6 +77,15 @@ namespace grb { /** One data pointer per thread. */ std::vector< char * > data; + SptrsvSchedule( SptrsvSchedule && toMove ) { + moveImpl( toMove ); + } + + SptrsvSchedule& operator=( SptrsvSchedule &&toMove ) { + moveImpl( toMove ); + return *this; + } + /** * Base constructor. * @@ -93,9 +112,9 @@ namespace grb { * a single-superstep schedule that assigns all work to the first thread. */ SptrsvSchedule( const NIT n, const size_t T ) : - _deleters( T ), supersteps( 1 ), nThreads( T ), data( T ) + _deleters( T ), supersteps( 1 ), nThreads( T ), data( T , nullptr ) { - data[ 0 ] = nullptr; + //data[ 0 ] = nullptr; if( T == 0 || T > std::numeric_limits< int >::max() ) { throw std::runtime_error( "Invalid number of threads" ); } @@ -129,7 +148,7 @@ namespace grb { const grb::RC rc = utils::alloc( "grb::internal::SptrsvSchedule (default constructor)", "default thread-local data allocation", - data[ s ], supersteps * sizeof(NIT), false, _deleters[ s ] + data[ s ], 2 * supersteps * sizeof(NIT), false, _deleters[ s ] ); if( rc != grb::SUCCESS ) { throw std::bad_alloc(); diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index e79c3a80c..7c7254598 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2368,8 +2368,9 @@ namespace grb { // get SpTrsv schedule const SptrsvSchedule< NIT > *sptrsvSchedule_p = internal::getSptrsvData( T ); + const SptrsvSchedule< NIT > default_schedule (n); const SptrsvSchedule< NIT > &sptrsv = sptrsvSchedule_p == nullptr ? - SptrsvSchedule< NIT >( n ) : *sptrsvSchedule_p; + default_schedule : *sptrsvSchedule_p; // only execute and resize are supported assert( phase == grb::EXECUTE ); diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 7b7c43098..7be64e203 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -255,22 +255,25 @@ namespace grb { > void allocateSptrsvSchedule( grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, - const size_t nSteps, const size_t nThreads + const size_t nSteps, const NIT n, const size_t nThreads ) { - if( A.sptrsvSchedule == nullptr ) { - throw std::runtime_error( "SptrsvSchedule was not initialised" ); + if( A.sptrsvSchedule != nullptr ) { + throw std::runtime_error( "SptrsvSchedule was already initialised" ); } + A.sptrsvSchedule = new SptrsvSchedule(n, nThreads); + auto &sptrsv = *(A.sptrsvSchedule); - if( sptrsv.nThreads != nThreads ) { - throw std::runtime_error( "SptrsvSchedule was allocated with different " - "nThreads" ); - } + sptrsv.supersteps = nSteps; + // disable default trivial schedule + sptrsv.data[ 0 ] = nullptr; + #pragma omp parallel { const size_t actualNumThreads = omp_get_num_threads(); - if( actualNumThreads != nThreads ) { + + if( actualNumThreads < nThreads ) { throw std::runtime_error( "Unexpected number of threads" ); } const size_t s = omp_get_thread_num(); @@ -306,6 +309,7 @@ namespace grb { } const size_t s = omp_get_thread_num(); // get buffer as an array of NIT, which we will write to in one pass + assert( sptrsv.data[ s ] != nullptr ); NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]); assert( bounds != bounds_end ); size_t count = 0; @@ -1326,7 +1330,7 @@ namespace grb { > friend void internal::allocateSptrsvSchedule( grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, - const size_t nSteps, const size_t nThreads + const size_t nSteps, const NIT n, const size_t nThreads ); template< From 6617cada9e39509dbe97bb60611c6d2e08beb3db Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 12:23:07 +0200 Subject: [PATCH 22/55] There can be multiple ranges per superstep per thread that OSP returns -- create space to store how many ranges per superstep per thread there are --- .../graphblas/reference/SptrsvSchedule.hpp | 69 ++++++++++++++----- include/graphblas/reference/matrix.hpp | 16 +++-- 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index e4d8426b9..322cc601e 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -41,24 +41,50 @@ namespace grb { /** One deleter for each chunk of thread-local data. */ std::vector< utils::AutoDeleter< char > > _deleters; + /** + * Fixed-size buffer for realising the default schedule. + * + * The first two positions are reserved for the range of the default + * schedule, while the last position is reserved for the end-position of + * the default schedule. + */ + NIT default_schedule[ 3 ]; + /** Computes and initialises a trivial schedule. */ void initTrivial( const NIT n ) { + // dynamic sanity check assert( data[ 0 ] == nullptr ); + // set trivial schedule + default_schedule[ 0 ] = 0; + default_schedule[ 1 ] = n; + default_schedule[ 2 ] = 1; + // set pointers to trivial schedule data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) ); - NIT * const interpreted = reinterpret_cast< NIT * >(data[0]); - interpreted[ 0 ] = 0; - interpreted[ 1 ] = n; + endPostions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) ); + // verify trivial schedule +#ifndef NDEBUG + { + NIT * const interpreted = reinterpret_cast< NIT * >(data[0]); + assert( interpreted[ 0 ] == 0 ); + assert( interpreted[ 1 ] == n ); + } + { + NIT * const interpreted = reinterpret_cast< NIT * >(endPositions[0]); + assert( *interpreted == 1 ); + } +#endif } - /** Fixed-size buffer for realising the default schedule. */ - NIT default_schedule[2]; - - void moveImpl( SptrsvSchedule && toMove ) { + /** + * Implementation of move construction and assignment. + */ + void moveImpl( SptrsvSchedule &&toMove ) { _deleters = std::move( toMove._deleters ); default_schedule = std::move( toMove.default_schedule ); supersteps = toMove.supersteps; nThreads = toMove.nThreads; data = std::move( toMove.data ); + endPositions = std::move( toMove.endPositions ); toMove.supersteps = 0; toMove.nThreads = 0; } @@ -77,13 +103,12 @@ namespace grb { /** One data pointer per thread. */ std::vector< char * > data; - SptrsvSchedule( SptrsvSchedule && toMove ) { - moveImpl( toMove ); - } + /** One end-position array per thread. */ + std::vector< char * > endPostions; - SptrsvSchedule& operator=( SptrsvSchedule &&toMove ) { + /** Move constructor. */ + SptrsvSchedule( SptrsvSchedule &&toMove ) { moveImpl( toMove ); - return *this; } /** @@ -96,9 +121,11 @@ namespace grb { * first (and only) thread. */ SptrsvSchedule( const NIT n ) : - _deleters( 1 ), supersteps( 1 ), nThreads( 1 ), data( 1 ) + _deleters( 1 ), supersteps( 1 ), nThreads( 1 ), + data( 1 ), endPositions( 1 ) { data[ 0 ] = nullptr; + endPositions[ 0 ] = nullptr; initTrivial( n ); } @@ -112,9 +139,9 @@ namespace grb { * a single-superstep schedule that assigns all work to the first thread. */ SptrsvSchedule( const NIT n, const size_t T ) : - _deleters( T ), supersteps( 1 ), nThreads( T ), data( T , nullptr ) + _deleters( 2 * T ), supersteps( 1 ), nThreads( T ), + data( T, nullptr ), endPositions( T, nullptr ) { - //data[ 0 ] = nullptr; if( T == 0 || T > std::numeric_limits< int >::max() ) { throw std::runtime_error( "Invalid number of threads" ); } @@ -134,21 +161,29 @@ namespace grb { } } + /** Move assignment. */ + SptrsvSchedule& operator=( SptrsvSchedule &&toMove ) { + moveImpl( toMove ); + return *this; + } + /** * Allocates a thread-local chunk of data. * * Must be called from within the thread that will use it(!) */ - void alloc( const size_t s ) { + void alloc( const size_t s, const size_t nRanges ) { assert( s < nThreads ); assert( supersteps > 0 ); assert( _deleters.size() >= s ); assert( data.size() >= s ); assert( data[ s ] == nullptr ); + assert( endPositions[ s ] == nullptr ); const grb::RC rc = utils::alloc( "grb::internal::SptrsvSchedule (default constructor)", "default thread-local data allocation", - data[ s ], 2 * supersteps * sizeof(NIT), false, _deleters[ s ] + data[ s ], 2 * nRanges * sizeof(NIT), false, _deleters[ s ], + endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ] ); if( rc != grb::SUCCESS ) { throw std::bad_alloc(); diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 7be64e203..606a19285 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -251,11 +251,13 @@ namespace grb { } template< - typename InputType, typename RIT, typename CIT, typename NIT + typename InputType, typename RIT, typename CIT, typename NIT, + typename NumRangesIt > void allocateSptrsvSchedule( grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, - const size_t nSteps, const NIT n, const size_t nThreads + const NIT n, const size_t nThreads, const size_t nSteps, + const NumRangesIt nRanges, const NumRangesIt nRanges_end ) { if( A.sptrsvSchedule != nullptr ) { throw std::runtime_error( "SptrsvSchedule was already initialised" ); @@ -268,19 +270,25 @@ namespace grb { // disable default trivial schedule sptrsv.data[ 0 ] = nullptr; + sptrsv.endPositions[ 0 ] = nullptr; #pragma omp parallel { + auto localIt = nRanges; + std::advance( localIt, s ); const size_t actualNumThreads = omp_get_num_threads(); if( actualNumThreads < nThreads ) { throw std::runtime_error( "Unexpected number of threads" ); } + if( *localIt <= 0 ) { + throw std::runtime_error( "Unexpected number of ranges" ); + } const size_t s = omp_get_thread_num(); - if( sptrsv.data[ s ] ) { + if( sptrsv.data[ s ] || sptrsv.endPositions[ s ] ) { throw std::runtime_error( "A thread-local schedule already existed" ); } - sptrsv.alloc( s ); + sptrsv.alloc( s, *localIt ); } } From fba4d082269263d2d79848653dcab916ad0f5e9f Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 12:31:26 +0200 Subject: [PATCH 23/55] Bugfixes in previous commit, apologies --- include/graphblas/reference/SptrsvSchedule.hpp | 4 ++-- include/graphblas/reference/matrix.hpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 322cc601e..fc9fe8b7b 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -60,7 +60,7 @@ namespace grb { default_schedule[ 2 ] = 1; // set pointers to trivial schedule data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) ); - endPostions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) ); + endPositions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) ); // verify trivial schedule #ifndef NDEBUG { @@ -104,7 +104,7 @@ namespace grb { std::vector< char * > data; /** One end-position array per thread. */ - std::vector< char * > endPostions; + std::vector< char * > endPositions; /** Move constructor. */ SptrsvSchedule( SptrsvSchedule &&toMove ) { diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 606a19285..72db3fb5e 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -275,16 +275,15 @@ namespace grb { #pragma omp parallel { auto localIt = nRanges; - std::advance( localIt, s ); + const size_t s = omp_get_thread_num(); const size_t actualNumThreads = omp_get_num_threads(); - if( actualNumThreads < nThreads ) { throw std::runtime_error( "Unexpected number of threads" ); } + std::advance( localIt, s ); if( *localIt <= 0 ) { throw std::runtime_error( "Unexpected number of ranges" ); } - const size_t s = omp_get_thread_num(); if( sptrsv.data[ s ] || sptrsv.endPositions[ s ] ) { throw std::runtime_error( "A thread-local schedule already existed" ); } From 2dfa32919f5f5dc4c9bc9620200f26dd73bfc85f Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 12:36:43 +0200 Subject: [PATCH 24/55] Update sptrsv implementation to make use of new schedule field. Tests OK --- include/graphblas/reference/blas2.hpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 7c7254598..473c0c8f3 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2382,13 +2382,17 @@ namespace grb { const int s = omp_get_thread_num(); const NIT *__restrict__ data = reinterpret_cast< const NIT * >(sptrsv.data[ s ]); + const NIT *__restrict__ const end = + reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]); for( size_t i = 0; i < sptrsv.supersteps; ++i ) { - const size_t lo = static_cast< size_t >( *data++ ); - const size_t no = static_cast< size_t >( *data++ ); - assert( lo < n ); - assert( no + lo <= n ); - ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + for( size_t k = 0; k < end[ i ]; ++k ) { + const size_t lo = static_cast< size_t >( *data++ ); + const size_t no = static_cast< size_t >( *data++ ); + assert( lo < n ); + assert( no + lo <= n ); + ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + } #pragma omp barrier } } From bc6939e1a61d9a4178fc320a7eaeba0f25f7e2dd Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 13:33:24 +0200 Subject: [PATCH 25/55] Update schedule ingestion accordingly --- include/graphblas/reference/matrix.hpp | 54 +++++++++++++++++++++----- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 72db3fb5e..e126d4f2a 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -291,6 +291,19 @@ namespace grb { } } + /** + * This schedule ingestion method expects the schedule as an iterator-pair + * over a container of doubly-nested vectors. Each such vector v iterated + * over, corresponds to a superstep. For a given superstep's v, v[s][k] has + * that s indicates the thread ID and k indicates the floor( k / 2 )-th range + * to be processed in that superstep and at that thread. Even-numbered k + * indicate lower bounds of the range, while odd k indicate their upper + * bounds. + * + * \note This is not an ideal way to pass around schedules, as it assumes the + * container element types are doubly-nested STL vectors-- it is hence + * not a generic approach. + */ template< typename InputType, typename RIT, typename CIT, typename NIT, typename It > @@ -318,20 +331,41 @@ namespace grb { // get buffer as an array of NIT, which we will write to in one pass assert( sptrsv.data[ s ] != nullptr ); NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]); + // get the number of ranges for a given superstep at this thread that this + // function should populate + NIT *__restrict__ end = reinterpret_cast< NIT * >(sptrsv.endPositions[ s ]); + // dynamic assertion: there should be at least one superstep assert( bounds != bounds_end ); + // start ingestion size_t count = 0; do { - const NIT l = (*bounds)[s][0]; - const NIT h = (*bounds)[s][1]; - assert( h >= l ); - const NIT n = h - l; - if( count >= sptrsv.supersteps ) { - throw std::runtime_error( "Too many supersteps" ); + assert( bounds->size() == nThreads ); + // get this superstep's and this thread's vector v[s] + const auto &v = (*bounds)[s]; + // go range-by-range + size_t nRanges = 0; + for( auto it = v.cbegin(); it != v.cend(); ++it ) { + // we have a range, parse it + const NIT l = *it++; + assert( it != v.cend() ); + const NIT h = *it + 1; + assert( h >= l ); + const NIT n = h - l; + if( count >= sptrsv.supersteps ) { + throw std::runtime_error( "Too many supersteps" ); + } + // store it + *array++ = l; + *array++ = n; + // TODO we do not / cannot double-check nRanges with the data we have + // stored at the moment. This may potentially by fixed (FIXME). + (void) ++nRanges; } - *array++ = l; - *array++ = n; - (void) count++; - (void) bounds++; + // store the number of ranges in the end array + *end++ = nRanges; + // forward to the next superstep + (void) ++count; + (void) ++bounds; } while( bounds != bounds_end ); if( count != sptrsv.supersteps ) { throw std::runtime_error( "Unexpected number of supersteps" ); From 708959cb9aed4691be7f449e8a0d544c539023cc Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 13:55:34 +0200 Subject: [PATCH 26/55] Forgot to update friend declaration --- include/graphblas/reference/matrix.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index e126d4f2a..024904b60 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -1367,11 +1367,13 @@ namespace grb { ) noexcept; template< - typename InputType, typename RIT, typename CIT, typename NIT + typename InputType, typename RIT, typename CIT, typename NIT, + typename NumRangesIt > friend void internal::allocateSptrsvSchedule( grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, - const size_t nSteps, const NIT n, const size_t nThreads + const NIT n, const size_t nThreads, const size_t nSteps, + const NumRangesIt nRanges, const NumRangesIt nRanges_end ); template< From bfe58a87ecac2b442a7e13245f804893a060f990 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 14:27:33 +0200 Subject: [PATCH 27/55] Should allow for empty ranges --- .../graphblas/reference/SptrsvSchedule.hpp | 22 ++++++++++++++----- include/graphblas/reference/blas2.hpp | 1 + include/graphblas/reference/matrix.hpp | 3 --- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index fc9fe8b7b..6d776ddde 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -179,12 +179,22 @@ namespace grb { assert( data.size() >= s ); assert( data[ s ] == nullptr ); assert( endPositions[ s ] == nullptr ); - const grb::RC rc = utils::alloc( - "grb::internal::SptrsvSchedule (default constructor)", - "default thread-local data allocation", - data[ s ], 2 * nRanges * sizeof(NIT), false, _deleters[ s ], - endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ] - ); + grb::RC rc = grb::SUCCESS; + if( nRanges > 0 ) { + rc = utils::alloc( + "grb::internal::SptrsvSchedule (default constructor)", + "default thread-local data allocation, variant I", + data[ s ], 2 * nRanges * sizeof(NIT), false, _deleters[ s ], + endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ] + ); + } else { + data[ s ] = nullptr; + rc = utils::alloc( + "grb::internal::SptrsvSchedule (default constructor)", + "default thread-local data allocation, variant II", + endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ] + ); + } if( rc != grb::SUCCESS ) { throw std::bad_alloc(); } diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 473c0c8f3..f6115995d 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2384,6 +2384,7 @@ namespace grb { reinterpret_cast< const NIT * >(sptrsv.data[ s ]); const NIT *__restrict__ const end = reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]); + assert( end != nullptr ); for( size_t i = 0; i < sptrsv.supersteps; ++i ) { for( size_t k = 0; k < end[ i ]; ++k ) { const size_t lo = static_cast< size_t >( *data++ ); diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 024904b60..787850993 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -281,9 +281,6 @@ namespace grb { throw std::runtime_error( "Unexpected number of threads" ); } std::advance( localIt, s ); - if( *localIt <= 0 ) { - throw std::runtime_error( "Unexpected number of ranges" ); - } if( sptrsv.data[ s ] || sptrsv.endPositions[ s ] ) { throw std::runtime_error( "A thread-local schedule already existed" ); } From 25684b9b597620c6b964620bba019ed433dfa769 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 14:35:17 +0200 Subject: [PATCH 28/55] I am not enthused about the exceptions that the schedule format appears to have to account for-- it may apparently also occur that the number of inner dimensions is not always constant --- include/graphblas/reference/matrix.hpp | 40 ++++++++++++++------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 787850993..e3c7ac4be 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -336,27 +336,29 @@ namespace grb { // start ingestion size_t count = 0; do { - assert( bounds->size() == nThreads ); - // get this superstep's and this thread's vector v[s] - const auto &v = (*bounds)[s]; - // go range-by-range + assert( bounds->size() <= nThreads ); size_t nRanges = 0; - for( auto it = v.cbegin(); it != v.cend(); ++it ) { - // we have a range, parse it - const NIT l = *it++; - assert( it != v.cend() ); - const NIT h = *it + 1; - assert( h >= l ); - const NIT n = h - l; - if( count >= sptrsv.supersteps ) { - throw std::runtime_error( "Too many supersteps" ); + if( bounds->size() >= s ) { + // get this superstep's and this thread's vector v[s] + const auto &v = (*bounds)[s]; + // go range-by-range + for( auto it = v.cbegin(); it != v.cend(); ++it ) { + // we have a range, parse it + const NIT l = *it++; + assert( it != v.cend() ); + const NIT h = *it + 1; + assert( h >= l ); + const NIT n = h - l; + if( count >= sptrsv.supersteps ) { + throw std::runtime_error( "Too many supersteps" ); + } + // store it + *array++ = l; + *array++ = n; + // TODO we do not / cannot double-check nRanges with the data we have + // stored at the moment. This may potentially by fixed (FIXME). + (void) ++nRanges; } - // store it - *array++ = l; - *array++ = n; - // TODO we do not / cannot double-check nRanges with the data we have - // stored at the moment. This may potentially by fixed (FIXME). - (void) ++nRanges; } // store the number of ranges in the end array *end++ = nRanges; From 834e375ca39c8b8ba19609a2293951581a882b3d Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 14:42:23 +0200 Subject: [PATCH 29/55] There can be processes without any range --- include/graphblas/reference/matrix.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index e3c7ac4be..6fbe2bb11 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -326,7 +326,6 @@ namespace grb { } const size_t s = omp_get_thread_num(); // get buffer as an array of NIT, which we will write to in one pass - assert( sptrsv.data[ s ] != nullptr ); NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]); // get the number of ranges for a given superstep at this thread that this // function should populate From 471f8477a468983cc3303177d72832785761fab7 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 14:59:45 +0200 Subject: [PATCH 30/55] Partial undo of one or two commits ago -- it should not happen that there are empty inner vectors in the schedule -- something else is off --- include/graphblas/reference/matrix.hpp | 40 ++++++++++++-------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 6fbe2bb11..36a008fd0 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -335,29 +335,27 @@ namespace grb { // start ingestion size_t count = 0; do { - assert( bounds->size() <= nThreads ); + assert( bounds->size() == nThreads ); size_t nRanges = 0; - if( bounds->size() >= s ) { - // get this superstep's and this thread's vector v[s] - const auto &v = (*bounds)[s]; - // go range-by-range - for( auto it = v.cbegin(); it != v.cend(); ++it ) { - // we have a range, parse it - const NIT l = *it++; - assert( it != v.cend() ); - const NIT h = *it + 1; - assert( h >= l ); - const NIT n = h - l; - if( count >= sptrsv.supersteps ) { - throw std::runtime_error( "Too many supersteps" ); - } - // store it - *array++ = l; - *array++ = n; - // TODO we do not / cannot double-check nRanges with the data we have - // stored at the moment. This may potentially by fixed (FIXME). - (void) ++nRanges; + // get this superstep's and this thread's vector v[s] + const auto &v = (*bounds)[s]; + // go range-by-range + for( auto it = v.cbegin(); it != v.cend(); ++it ) { + // we have a range, parse it + const NIT l = *it++; + assert( it != v.cend() ); + const NIT h = *it + 1; + assert( h >= l ); + const NIT n = h - l; + if( count >= sptrsv.supersteps ) { + throw std::runtime_error( "Too many supersteps" ); } + // store it + *array++ = l; + *array++ = n; + // TODO we do not / cannot double-check nRanges with the data we have + // stored at the moment. This may potentially by fixed (FIXME). + (void) ++nRanges; } // store the number of ranges in the end array *end++ = nRanges; From 0cb629541e96180565c1712f5f7e60e17673fafd Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 15:03:21 +0200 Subject: [PATCH 31/55] Found the likely culprit! --- include/graphblas/reference/matrix.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 36a008fd0..ebe7de820 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -306,7 +306,7 @@ namespace grb { > void setSptrsvSchedule( grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, - It bounds, const It &bounds_end, + const It &bounds_begin, const It &bounds_end, const size_t nThreads ) { if( A.sptrsvSchedule == nullptr ) { @@ -320,6 +320,7 @@ namespace grb { #pragma omp parallel { + It bounds = bounds_begin; const size_t actualNumThreads = omp_get_num_threads(); if( actualNumThreads != nThreads ) { throw std::runtime_error( "Unexpected number of threads" ); From 64c64eee5739b01cdfa4069e4644ab72cdc51961 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 15:04:57 +0200 Subject: [PATCH 32/55] Also update friend declaration --- include/graphblas/reference/matrix.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index ebe7de820..06bd4eb6f 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -1378,7 +1378,7 @@ namespace grb { > friend void internal::setSptrsvSchedule( grb::Matrix< InputType, reference, RIT, CIT, NIT > &A, - It bounds, const It &bounds_end, + const It &bounds, const It &bounds_end, const size_t nThreads ); From dd17fba5d19d5591bc8b03e9f0814defb879114c Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 18:38:21 +0200 Subject: [PATCH 33/55] Additional hardening of schedule ingestion logic --- .../graphblas/reference/SptrsvSchedule.hpp | 19 ++++++++++++++----- include/graphblas/reference/matrix.hpp | 8 ++++++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 6d776ddde..543961b98 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -58,6 +58,7 @@ namespace grb { default_schedule[ 0 ] = 0; default_schedule[ 1 ] = n; default_schedule[ 2 ] = 1; + nRanges[ 0 ] = 1; // set pointers to trivial schedule data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) ); endPositions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) ); @@ -85,6 +86,7 @@ namespace grb { nThreads = toMove.nThreads; data = std::move( toMove.data ); endPositions = std::move( toMove.endPositions ); + nRanges = std::move( toMove.nRanges ); toMove.supersteps = 0; toMove.nThreads = 0; } @@ -106,6 +108,9 @@ namespace grb { /** One end-position array per thread. */ std::vector< char * > endPositions; + /** The total number of ranges per thread. */ + std::vector< NIT > nRanges; + /** Move constructor. */ SptrsvSchedule( SptrsvSchedule &&toMove ) { moveImpl( toMove ); @@ -122,7 +127,7 @@ namespace grb { */ SptrsvSchedule( const NIT n ) : _deleters( 1 ), supersteps( 1 ), nThreads( 1 ), - data( 1 ), endPositions( 1 ) + data( 1 ), endPositions( 1 ), nRanges( 1 ) { data[ 0 ] = nullptr; endPositions[ 0 ] = nullptr; @@ -140,7 +145,7 @@ namespace grb { */ SptrsvSchedule( const NIT n, const size_t T ) : _deleters( 2 * T ), supersteps( 1 ), nThreads( T ), - data( T, nullptr ), endPositions( T, nullptr ) + data( T, nullptr ), endPositions( T, nullptr ), nRanges( T, 0 ) { if( T == 0 || T > std::numeric_limits< int >::max() ) { throw std::runtime_error( "Invalid number of threads" ); @@ -172,19 +177,23 @@ namespace grb { * * Must be called from within the thread that will use it(!) */ - void alloc( const size_t s, const size_t nRanges ) { + void alloc( const size_t s, const size_t nRanges_in ) { assert( s < nThreads ); assert( supersteps > 0 ); assert( _deleters.size() >= s ); assert( data.size() >= s ); assert( data[ s ] == nullptr ); assert( endPositions[ s ] == nullptr ); + if( nRanges[ s ] != nRanges_in ) { + throw std::runtime_error( "Given nRanges differs from the one given at " + "allocation time" ); + } grb::RC rc = grb::SUCCESS; - if( nRanges > 0 ) { + if( nRanges_in > 0 ) { rc = utils::alloc( "grb::internal::SptrsvSchedule (default constructor)", "default thread-local data allocation, variant I", - data[ s ], 2 * nRanges * sizeof(NIT), false, _deleters[ s ], + data[ s ], 2 * nRanges_in * sizeof(NIT), false, _deleters[ s ], endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ] ); } else { diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 06bd4eb6f..4a6958a04 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -267,6 +267,10 @@ namespace grb { auto &sptrsv = *(A.sptrsvSchedule); sptrsv.supersteps = nSteps; + { + std::vector< NIT > tmp( nRanges, nRanges_end ); + sptrsv.nRanges = std::move( tmp ); + } // disable default trivial schedule sptrsv.data[ 0 ] = nullptr; @@ -354,11 +358,11 @@ namespace grb { // store it *array++ = l; *array++ = n; - // TODO we do not / cannot double-check nRanges with the data we have - // stored at the moment. This may potentially by fixed (FIXME). (void) ++nRanges; + assert( nRanges < sptrsv.nRanges[ s ] ); } // store the number of ranges in the end array + assert( count < sptrsv.supersteps ); *end++ = nRanges; // forward to the next superstep (void) ++count; From a5b545488dbdb0c99c5a32303d2143593ec33b44 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 18:41:38 +0200 Subject: [PATCH 34/55] Metabugfix --- include/graphblas/reference/matrix.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 4a6958a04..282f4c504 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -359,7 +359,7 @@ namespace grb { *array++ = l; *array++ = n; (void) ++nRanges; - assert( nRanges < sptrsv.nRanges[ s ] ); + assert( nRanges <= sptrsv.nRanges[ s ] ); } // store the number of ranges in the end array assert( count < sptrsv.supersteps ); From f9bf41d7d5ee880c43fd6ef76ff2d00d893bb18e Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 18:46:38 +0200 Subject: [PATCH 35/55] Silly bug causing difficult-to-track memory errors, apologies --- include/graphblas/reference/SptrsvSchedule.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 543961b98..743844e32 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -194,14 +194,14 @@ namespace grb { "grb::internal::SptrsvSchedule (default constructor)", "default thread-local data allocation, variant I", data[ s ], 2 * nRanges_in * sizeof(NIT), false, _deleters[ s ], - endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ] + endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + T ] ); } else { data[ s ] = nullptr; rc = utils::alloc( "grb::internal::SptrsvSchedule (default constructor)", "default thread-local data allocation, variant II", - endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ 2 * s ] + endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + T ] ); } if( rc != grb::SUCCESS ) { From 573c80d8f0a614bd636f84ff6c11aa41ad4f9073 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 18:48:13 +0200 Subject: [PATCH 36/55] Fix typo --- include/graphblas/reference/SptrsvSchedule.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 743844e32..214f5ff60 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -194,14 +194,14 @@ namespace grb { "grb::internal::SptrsvSchedule (default constructor)", "default thread-local data allocation, variant I", data[ s ], 2 * nRanges_in * sizeof(NIT), false, _deleters[ s ], - endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + T ] + endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + nThreads ] ); } else { data[ s ] = nullptr; rc = utils::alloc( "grb::internal::SptrsvSchedule (default constructor)", "default thread-local data allocation, variant II", - endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + T ] + endPositions[ s ], supersteps * sizeof( NIT ), false, _deleters[ s + nThreads ] ); } if( rc != grb::SUCCESS ) { From 4f42d4eb0d5ca9bb4b12639d8950b4f85c4fdeeb Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 10 Apr 2025 18:52:36 +0200 Subject: [PATCH 37/55] Move assertion to before writing to the location the assert checks --- include/graphblas/reference/matrix.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 282f4c504..f768b2a1a 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -356,10 +356,10 @@ namespace grb { throw std::runtime_error( "Too many supersteps" ); } // store it + assert( nRanges < sptrsv.nRanges[ s ] ); *array++ = l; *array++ = n; (void) ++nRanges; - assert( nRanges <= sptrsv.nRanges[ s ] ); } // store the number of ranges in the end array assert( count < sptrsv.supersteps ); From fc968dc651897c43a7261b3f5d3efc7ee1e60402 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 11 Apr 2025 11:14:59 +0200 Subject: [PATCH 38/55] Fix meta-bug --- include/graphblas/reference/blas2.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index f6115995d..943c15bbb 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2358,10 +2358,6 @@ namespace grb { const Division &division, const Phase &phase ) { - // dynamic sanity checks - assert( grb::nrows( T ) == n ); - assert( grb::ncols( T ) == n ); - // in dense unmasked, resize is a no-op if( phase == grb::RESIZE ) { return grb::SUCCESS; } From 3f5c4c1f8f444e91106ef3ab4cd089493425fc81 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 11 Apr 2025 12:14:47 +0200 Subject: [PATCH 39/55] Add back support for simple schedules, as the other OSP sptrsv variants uses that --- .../graphblas/reference/SptrsvSchedule.hpp | 11 ++++- include/graphblas/reference/blas2.hpp | 42 ++++++++++++++----- include/graphblas/reference/matrix.hpp | 9 ++++ 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 214f5ff60..8811f80cc 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -74,6 +74,8 @@ namespace grb { assert( *interpreted == 1 ); } #endif + // note that this is a trivial schedule + is_simple = true; } /** @@ -82,11 +84,13 @@ namespace grb { void moveImpl( SptrsvSchedule &&toMove ) { _deleters = std::move( toMove._deleters ); default_schedule = std::move( toMove.default_schedule ); + is_simple = toMove.is_simple; supersteps = toMove.supersteps; nThreads = toMove.nThreads; data = std::move( toMove.data ); endPositions = std::move( toMove.endPositions ); nRanges = std::move( toMove.nRanges ); + toMove.is_simple = false; toMove.supersteps = 0; toMove.nThreads = 0; } @@ -94,6 +98,9 @@ namespace grb { public: + /** Whether the schedule is simple.*/ + bool is_simple; + /** Number of schedule steps. */ size_t supersteps; @@ -126,7 +133,7 @@ namespace grb { * first (and only) thread. */ SptrsvSchedule( const NIT n ) : - _deleters( 1 ), supersteps( 1 ), nThreads( 1 ), + _deleters( 1 ), is_simple( false ), supersteps( 1 ), nThreads( 1 ), data( 1 ), endPositions( 1 ), nRanges( 1 ) { data[ 0 ] = nullptr; @@ -144,7 +151,7 @@ namespace grb { * a single-superstep schedule that assigns all work to the first thread. */ SptrsvSchedule( const NIT n, const size_t T ) : - _deleters( 2 * T ), supersteps( 1 ), nThreads( T ), + _deleters( 2 * T ), is_simple( false ), supersteps( 1 ), nThreads( T ), data( T, nullptr ), endPositions( T, nullptr ), nRanges( T, 0 ) { if( T == 0 || T > std::numeric_limits< int >::max() ) { diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 943c15bbb..c431614a6 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2373,24 +2373,44 @@ namespace grb { // start parallel section grb::RC ret = grb::SUCCESS; - #pragma omp parallel num_threads(sptrsv.nThreads) - { - const int s = omp_get_thread_num(); - const NIT *__restrict__ data = - reinterpret_cast< const NIT * >(sptrsv.data[ s ]); - const NIT *__restrict__ const end = - reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]); - assert( end != nullptr ); - for( size_t i = 0; i < sptrsv.supersteps; ++i ) { - for( size_t k = 0; k < end[ i ]; ++k ) { + + if( sptrsv.is_simple ) { + #pragma omp parallel num_threads(sptrsv.nThreads) + { + const int s = omp_get_thread_num(); + const NIT *__restrict__ data = + reinterpret_cast< const NIT * >(sptrsv.data[ s ]); + assert( data != nullptr ); + for( size_t i = 0; i < sptrsv.supersteps; ++i ) { const size_t lo = static_cast< size_t >( *data++ ); const size_t no = static_cast< size_t >( *data++ ); assert( lo < n ); assert( no + lo <= n ); ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >( v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + #pragma omp barrier + } + } + } else { + #pragma omp parallel num_threads(sptrsv.nThreads) + { + const int s = omp_get_thread_num(); + const NIT *__restrict__ data = + reinterpret_cast< const NIT * >(sptrsv.data[ s ]); + const NIT *__restrict__ const end = + reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]); + assert( end != nullptr ); + for( size_t i = 0; i < sptrsv.supersteps; ++i ) { + for( size_t k = 0; k < end[ i ]; ++k ) { + const size_t lo = static_cast< size_t >( *data++ ); + const size_t no = static_cast< size_t >( *data++ ); + assert( lo < n ); + assert( no + lo <= n ); + ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + } + #pragma omp barrier } - #pragma omp barrier } } diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index f768b2a1a..00084ca75 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -322,8 +322,10 @@ namespace grb { "nThreads" ); } + sptrsv.is_simple = true; #pragma omp parallel { + bool simple_local = true; It bounds = bounds_begin; const size_t actualNumThreads = omp_get_num_threads(); if( actualNumThreads != nThreads ) { @@ -363,6 +365,9 @@ namespace grb { } // store the number of ranges in the end array assert( count < sptrsv.supersteps ); + if( nRanges != 1 ) { + simple_local = false; + } *end++ = nRanges; // forward to the next superstep (void) ++count; @@ -371,6 +376,10 @@ namespace grb { if( count != sptrsv.supersteps ) { throw std::runtime_error( "Unexpected number of supersteps" ); } + if( !simple_local ) { + #pragma omp critical + sptrsv.is_simple = false; + } } } From cd2c9674cd3589638f0f6dcfeb50569e50ea9674 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 11 Apr 2025 16:25:47 +0200 Subject: [PATCH 40/55] I rehashed a classic performance bug there, apologies -- now fixed! --- include/graphblas/reference/blas2.hpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index c431614a6..b6af9b23f 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2377,6 +2377,7 @@ namespace grb { if( sptrsv.is_simple ) { #pragma omp parallel num_threads(sptrsv.nThreads) { + grb::RC local_rc = grb::SUCCESS; const int s = omp_get_thread_num(); const NIT *__restrict__ data = reinterpret_cast< const NIT * >(sptrsv.data[ s ]); @@ -2386,14 +2387,21 @@ namespace grb { const size_t no = static_cast< size_t >( *data++ ); assert( lo < n ); assert( no + lo <= n ); - ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + local_rc = local_rc + ? local_rc + : dense_unmasked_sequential_sptrsv< descr, true >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); #pragma omp barrier } + if( local_rc != grb::SUCCESS ) { + #pragma omp atomic write + ret = local_rc; + } } } else { #pragma omp parallel num_threads(sptrsv.nThreads) { + grb::RC local_rc = grb::SUCCESS; const int s = omp_get_thread_num(); const NIT *__restrict__ data = reinterpret_cast< const NIT * >(sptrsv.data[ s ]); @@ -2406,11 +2414,17 @@ namespace grb { const size_t no = static_cast< size_t >( *data++ ); assert( lo < n ); assert( no + lo <= n ); - ret = ret ? ret : dense_unmasked_sequential_sptrsv< descr, true >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + local_rc = local_rc + ? local_rc + : dense_unmasked_sequential_sptrsv< descr, true >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); } #pragma omp barrier } + if( local_rc != grb::SUCCESS ) { + #pragma omp atomic write + ret = local_rc; + } } } From 3e0eda86a12ab8edb9f83cdd5960f9f9668818ce Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Fri, 11 Apr 2025 17:21:02 +0200 Subject: [PATCH 41/55] Another tricky performance bug-- and a silly mistake once more... --- include/graphblas/reference/blas2.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index b6af9b23f..eb0f4bf7f 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2286,7 +2286,7 @@ namespace grb { // switch forward or backward solve if( forward ) { const auto &crs = internal::getCRS( T ); - if( maybe_offset ) { + if( !maybe_offset ) { for( size_t i = 0; i < n; ++i ) { IOType divBy = semiring.template getZero< IOType >(); assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); From ad392992e1b381163975e05fd363213e38abe9f9 Mon Sep 17 00:00:00 2001 From: tonibohnlein <113011201+tonibohnlein@users.noreply.github.com> Date: Mon, 14 Apr 2025 11:05:08 +0200 Subject: [PATCH 42/55] fixed bug in blas2.hpp --- include/graphblas/reference/blas2.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index eb0f4bf7f..9935e9151 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2298,7 +2298,8 @@ namespace grb { (void) grb::foldl( v_raw[ i ], divBy, division ); } } else { - for( size_t i = offset; i < n; ++i ) { + for( size_t j = 0; j < n; ++j ) { + const size_t i = j + offset; IOType divBy = semiring.template getZero< IOType >(); assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); From ba98f0bf0633faac73cc13045a134d5eece17819 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Mon, 14 Apr 2025 13:00:57 +0200 Subject: [PATCH 43/55] Missing optimisation introduced --- include/graphblas/reference/blas2.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 9935e9151..d2c72fe2a 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2286,7 +2286,7 @@ namespace grb { // switch forward or backward solve if( forward ) { const auto &crs = internal::getCRS( T ); - if( !maybe_offset ) { + if( !maybe_offset || offset == 0 ) { for( size_t i = 0; i < n; ++i ) { IOType divBy = semiring.template getZero< IOType >(); assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); From 220b6f86b56a9c81d4b2d02e87caba8a550bf2ab Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Mon, 14 Apr 2025 14:36:19 +0200 Subject: [PATCH 44/55] Changes towards supporting sptrsv on sorted matrices --- include/graphblas/reference/blas2.hpp | 115 +++++++++++++++++++++----- 1 file changed, 96 insertions(+), 19 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index d2c72fe2a..a5781d0aa 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2217,8 +2217,17 @@ namespace grb { namespace internal { #ifndef _H_GRB_REFERENCE_OMP_BLAS2 - /** Backend-independent sptrsv kernel code. */ + /** + * Backend-independent sptrsv kernel code. + * + * @tparam sorted 0: the matrix rows are not sorted, + * 1: the matrix rows have diagonal elements at position 0, + * 2: the matrix rows are sorted. + * + * \note The above description assumes CRS. + */ template< + int sorted, typename IOType, typename InputType1, typename IND, typename NIT, class Semiring, class Subtraction @@ -2227,22 +2236,40 @@ namespace grb { const Compressed_Storage< InputType1, IND, NIT > &crs, const size_t i, IOType * const x, + const bool forward, IOType &divBy, const Semiring &semiring, const Subtraction &subtraction ) { + // static assertions + static_assert( sorted >= 0 && sorted < 3, "Illegal value for sorted; this " + "is an internal error, please submit a bug report" ); constexpr auto one = Semiring::template One< typename Semiring::D1 >::value(); + if( sorted ) { + (void) divBy; + } + assert( crs.col_start[ i + 1 ] > crs.col_start[ i ] ); + const size_t start = (!sorted) ? crs.col_start[ i ] : ( + (sorted == 1) ? (crs.col_start[ i ] + 1) : ( + forward ? crs.col_start[ i ] : (crs.col_start[ i ] + 1) ) + ); + const size_t end = (!sorted) ? crs.col_start[ i + 1 ] : ( + (sorted == 1) ? crs.col_start[ i + 1 ] : ( + forward ? (crs.col_start[ i + 1 ] - 1) : crs.col_start[ i + 1 ] ) + ); // TODO could do some ALP-style vectorisation here - for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { + for( size_t k = start; k < end; ++k ) { const typename Semiring::D1 val = crs.template getValue( k, one ); const auto &ind = crs.row_index[ k ]; - if( static_cast< size_t >(ind) == i ) { + if( !sorted ) { + if( static_cast< size_t >(ind) == i ) { #ifdef _DEBUG - std::cout << "\tskipping nonzero at " << i ", " << i << "\n"; + std::cout << "\tskipping nonzero at " << i ", " << i << "\n"; #endif - divBy = val; - continue; + divBy = val; + continue; + } } #ifdef _DEBUG std::cout << "\t x[ " << i << " ] (" << x[i] << ") -= " << val << "* x[ " @@ -2254,11 +2281,27 @@ namespace grb { (void) grb::foldl( x[ i ], tmp, subtraction ); } } -#endif + + template< + typename IOType, int sorted, + typename VIT, typename RCIT, typename NIT + > + inline static IOType getDiagonalEntry( + const Compressed_Storage< VIT, RCIT, NIT > &storage, const size_t &i, + const bool &forward, const IOType &one + ) { + assert( sorted >= 1 && sorted < 3 ); + assert( storage.col_start[ i + 1 ] > storage.col_start[ i ] ); + const size_t diag_index = sorted == 1 ? storage.col_start[ i ] : ( + forward ? (storage.col_start[ i + 1 ] - 1) : storage.col_start[ i ] ); + assert( storage.col_index[ diag_index ] == i ); + return storage.template getValue( diag_index, one ); + } +#endif // end ifndef _H_GRB_REFERENCE_OMP_BLAS2 /** \internal Specialised dense unmasked sptrsv implementation, sequential */ template< - Descriptor descr, bool maybe_offset, + Descriptor descr, bool maybe_offset, int sorted, class Semiring, class Subtraction, class Division, typename IOType, typename InputType1, typename RIT, typename CIT, typename NIT @@ -2273,6 +2316,10 @@ namespace grb { const Division &division, const Phase &phase ) { + // static sanity checks + static_assert( sorted >= 0 && sorted < 3, "Invalid value for sorted; this " + "an internal error, please submit a bug report" ); + // dynamic sanity checks assert( grb::nrows( T ) == n ); assert( grb::ncols( T ) == n ); @@ -2283,14 +2330,26 @@ namespace grb { // only execute and resize are supported assert( phase == grb::EXECUTE ); + // in case of pattern matrices, get 1 from the semiring + const IOType one = semiring.getMultiplicativeMonoid().template + getIdentity< IOType >(); + // switch forward or backward solve if( forward ) { const auto &crs = internal::getCRS( T ); if( !maybe_offset || offset == 0 ) { for( size_t i = 0; i < n; ++i ) { - IOType divBy = semiring.template getZero< IOType >(); + IOType divBy; + if( !sorted ) { + // in this case we will auto-detect the diagonal item + divBy = semiring.template getZero< IOType >(); + } else { + divBy = getDiagonalEntry< IOType, sorted, InputType1, RIT, NIT >( + crs, i, forward, one ); + } assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); - sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); + sptrsv_kernel< sorted >( crs, i, v_raw, forward, divBy, semiring, + subtraction ); #ifdef _DEBUG std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy << "\n"; @@ -2300,9 +2359,15 @@ namespace grb { } else { for( size_t j = 0; j < n; ++j ) { const size_t i = j + offset; - IOType divBy = semiring.template getZero< IOType >(); + IOType divBy; + if( !sorted ) { + divBy = semiring.template getZero< IOType >(); + } else { + divBy = getDiagonalEntry< IOType, sorted >( crs, i, forward, one ); + } assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); - sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); + sptrsv_kernel< sorted >( crs, i, v_raw, forward, divBy, semiring, + subtraction ); #ifdef _DEBUG std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy << "\n"; @@ -2314,9 +2379,15 @@ namespace grb { const auto &ccs = internal::getCCS( T ); if( !maybe_offset || offset == 0 ) { for( size_t i = n - 1; i < n; --i ) { - IOType divBy = semiring.template getZero< IOType >(); + IOType divBy; + if( !sorted ) { + divBy = semiring.template getZero< IOType >(); + } else { + divBy = getDiagonalEntry< IOType, sorted >( ccs, i, forward, one ); + } assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); - sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction ); + sptrsv_kernel< sorted >( ccs, i, v_raw, forward, divBy, semiring, + subtraction ); #ifdef _DEBUG std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy << "\n"; @@ -2325,9 +2396,15 @@ namespace grb { } } else { for( size_t i = n - 1 + offset; i >= offset; --i ) { - IOType divBy = semiring.template getZero< IOType >(); + IOType divBy; + if( !sorted ) { + divBy = semiring.template getZero< IOType >(); + } else { + divBy = getDiagonalEntry< IOType, sorted >( ccs, i, forward, one ); + } assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); - sptrsv_kernel( ccs, i, v_raw, divBy, semiring, subtraction ); + sptrsv_kernel< sorted >( ccs, i, v_raw, forward, divBy, semiring, + subtraction ); #ifdef _DEBUG std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy << "\n"; @@ -2390,7 +2467,7 @@ namespace grb { assert( no + lo <= n ); local_rc = local_rc ? local_rc - : dense_unmasked_sequential_sptrsv< descr, true >( + : dense_unmasked_sequential_sptrsv< descr, true, 1 >( v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); #pragma omp barrier } @@ -2417,7 +2494,7 @@ namespace grb { assert( no + lo <= n ); local_rc = local_rc ? local_rc - : dense_unmasked_sequential_sptrsv< descr, true >( + : dense_unmasked_sequential_sptrsv< descr, true, 1 >( v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); } #pragma omp barrier @@ -2533,7 +2610,7 @@ namespace grb { if( dense || grb::nnz( xb ) == n ) { IOType * const xb_p = internal::getRaw( xb ); #ifndef _H_GRB_REFERENCE_OMP_BLAS2 - return internal::dense_unmasked_sequential_sptrsv< descr, false >( + return internal::dense_unmasked_sequential_sptrsv< descr, false, 0 >( xb_p, T, 0, n, forward, semiring, subtraction, division, phase ); #else return internal::dense_unmasked_omp_sptrsv< descr >( From abc397bc3d33229a9079d9dad8de18dabc22ec13 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Mon, 14 Apr 2025 14:43:07 +0200 Subject: [PATCH 45/55] Move index computation into for-loop --- include/graphblas/reference/blas2.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 9935e9151..b76786384 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2298,8 +2298,8 @@ namespace grb { (void) grb::foldl( v_raw[ i ], divBy, division ); } } else { - for( size_t j = 0; j < n; ++j ) { - const size_t i = j + offset; + const size_t end = n + offset; + for( size_t i = offset; i < end; ++i ) { IOType divBy = semiring.template getZero< IOType >(); assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); sptrsv_kernel( crs, i, v_raw, divBy, semiring, subtraction ); From 4ec4720b29a78d91c697d30a410043948e417736 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Mon, 14 Apr 2025 17:00:15 +0200 Subject: [PATCH 46/55] Tried and failed vectorisation --- include/graphblas/reference/blas2.hpp | 65 ++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index b76786384..860fc2ffe 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2233,7 +2233,70 @@ namespace grb { ) { constexpr auto one = Semiring::template One< typename Semiring::D1 >::value(); - // TODO could do some ALP-style vectorisation here + + // NOTE: explicit vectorisation seems to cause significant slowdowns. + // disabled but retained for future reference / more successful + // optimisation + /* + // constants + constexpr size_t ind_blocksize = + grb::config::SIMD_BLOCKSIZE< IND >::value(); + constexpr size_t blocksize = ind_blocksize < Semiring::blocksize + ? ind_blocksize : Semiring::blocksize; + // constant bufer + IND cur_ind[ blocksize ]; + // SIMD set + for( size_t r = 0; r < blocksize; ++r ) { + cur_ind[ r ] = i; + } + for( ; k + Semiring::blocksize < crs.col_start[ i + 1 ]; + k += Semiring::blocksize + ) { + typename Semiring::D1 values[ blocksize ]; + typename Semiring::D3 tmp[ blocksize ]; + IOType x_simd[ blocksize ]; + IND indices[ blocksize ]; + bool match[ blocksize ]; + // streaming loads + for( size_t r = 0; r < blocksize; ++r ) { + values[ r ] = crs.template getValue( k + r, one ); + } + for( size_t r = 0; r < blocksize; ++r ) { + indices[ r ] = crs.row_index[ k + r ]; + } + // SIMD compute + for( size_t r = 0; r < blocksize; ++r ) { + match[ r ] = cur_ind[ r ] != indices[ r ]; + } + // gather + for( size_t r = 0; r < blocksize; ++r ) { + if( match[ r ] ) { + x_simd[ r ] = x[ indices[ r ] ]; + } + } + // masked SIMD compute + for( size_t r = 0; r < blocksize; ++r ) { + if( match[ r ] ) { + tmp[ r ] = values[ r ] * x_simd[ r ]; + } + } + // masked SIMD reduce + for( size_t r = 0; r < blocksize; ++r ) { + if( match[ r ] ) { + reduced += tmp[ r ]; + } + } + // diagonal detection; SIMD invert and reduce + for( size_t r = 0; r < blocksize; ++r ) { + match[ r ] = !match[ r ]; + } + for( size_t r = 0; r < blocksize; ++r ) { + if( match[ r ] ) { + divBy += values[ r ]; + } + } + }*/ + for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { const typename Semiring::D1 val = crs.template getValue( k, one ); const auto &ind = crs.row_index[ k ]; From 19b9473a70c973ada778dc0d4bdf288bc358aa70 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Mon, 14 Apr 2025 17:09:44 +0200 Subject: [PATCH 47/55] Const-reference passing where possible --- include/graphblas/reference/blas2.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 860fc2ffe..41c6c8da0 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2225,8 +2225,8 @@ namespace grb { > inline void sptrsv_kernel( const Compressed_Storage< InputType1, IND, NIT > &crs, - const size_t i, - IOType * const x, + const size_t &i, + IOType *__restrict__ const &x, IOType &divBy, const Semiring &semiring, const Subtraction &subtraction @@ -2327,10 +2327,10 @@ namespace grb { typename RIT, typename CIT, typename NIT > RC dense_unmasked_sequential_sptrsv( - IOType *__restrict__ const v_raw, + IOType *__restrict__ const &v_raw, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, - const size_t offset, const size_t &n, - const bool forward, + const size_t &offset, const size_t &n, + const bool &forward, const Semiring &semiring, const Subtraction &subtraction, const Division &division, @@ -2413,10 +2413,10 @@ namespace grb { typename RIT, typename CIT, typename NIT > RC dense_unmasked_omp_sptrsv( - IOType *__restrict__ const v_raw, + IOType *__restrict__ const &v_raw, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, const size_t &n, - const bool forward, + const bool &forward, const Semiring &semiring, const Subtraction &subtraction, const Division &division, @@ -2474,8 +2474,8 @@ namespace grb { assert( end != nullptr ); for( size_t i = 0; i < sptrsv.supersteps; ++i ) { for( size_t k = 0; k < end[ i ]; ++k ) { - const size_t lo = static_cast< size_t >( *data++ ); - const size_t no = static_cast< size_t >( *data++ ); + const size_t &lo = static_cast< size_t >( *data++ ); + const size_t &no = static_cast< size_t >( *data++ ); assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc From 90c12b82ece6e1de2fdf0e50930b772adc222dab Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Wed, 16 Apr 2025 13:18:17 +0200 Subject: [PATCH 48/55] Add in CRS column-sorting as a pre-processing / tuning step for SpTrsv --- include/graphblas/reference/matrix.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 00084ca75..e4f10a7b4 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -322,6 +322,30 @@ namespace grb { "nThreads" ); } + // re-sort A + { + const auto &crs = internal::getCRS( A ); + for( size_t i = 0; i < A.m; ++i ) { + std::vector< std::pair< int, double > > pairs; + for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { + pairs.push_back( std::make_pair( crs.row_index[ k ], crs.values[ k ] ) ); + } + std::sort( pairs.begin(), pairs.end(), + []( const std::pair< int, double > &left, const std::pair< int, double > &right ) { + return left.first < right.first; + } ); + auto it = pairs.cbegin(); + //std::cout << "Row " << i << ": "; + for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) { + //std::cout << it->first << ", "; + assert( it != pairs.cend() ); + crs.row_index[ k ] = it->first; + crs.values[ k ] = it->second; + } + //std::cout << std::endl; + } + } + sptrsv.is_simple = true; #pragma omp parallel { From 0a0fe6f52d23334ef646d680b3a46183263b6f39 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Wed, 16 Apr 2025 13:25:18 +0200 Subject: [PATCH 49/55] Remove debug statements, also sort CCS, and parallelise the sorting --- include/graphblas/reference/matrix.hpp | 39 +++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index e4f10a7b4..27e97cb22 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -322,9 +322,16 @@ namespace grb { "nThreads" ); } - // re-sort A + // re-sort A, CRS first { const auto &crs = internal::getCRS( A ); + const size_t nThreads = A.m < config::OMP::minLoopSize() + ? 1 + : std::min( + config::OMP::threads(), + A.nz / config::CACHE_LINE_SIZE::value() + ); + #pragma omp parallel for num_threads( nThreads ) for( size_t i = 0; i < A.m; ++i ) { std::vector< std::pair< int, double > > pairs; for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { @@ -335,14 +342,38 @@ namespace grb { return left.first < right.first; } ); auto it = pairs.cbegin(); - //std::cout << "Row " << i << ": "; for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) { - //std::cout << it->first << ", "; assert( it != pairs.cend() ); crs.row_index[ k ] = it->first; crs.values[ k ] = it->second; } - //std::cout << std::endl; + } + } + // then CCS + { + const auto &ccs = internal::getCCS( A ); + const size_t nThreads = A.n < config::OMP::minLoopSize() + ? 1 + : std::min( + config::OMP::threads(), + A.nz / config::CACHE_LINE_SIZE::value() + ); + #pragma omp parallel for num_threads( nThreads ) + for( size_t i = 0; i < A.n; ++i ) { + std::vector< std::pair< int, double > > pairs; + for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k ) { + pairs.push_back( std::make_pair( ccs.row_index[ k ], ccs.values[ k ] ) ); + } + std::sort( pairs.begin(), pairs.end(), + []( const std::pair< int, double > &left, const std::pair< int, double > &right ) { + return left.first < right.first; + } ); + auto it = pairs.cbegin(); + for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k, ++it ) { + assert( it != pairs.cend() ); + ccs.row_index[ k ] = it->first; + ccs.values[ k ] = it->second; + } } } From 38534246d83233c04a0bf4a05f76bf48dc2da81b Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Wed, 16 Apr 2025 13:28:48 +0200 Subject: [PATCH 50/55] Save testing code, assuming sorted arrays --- include/graphblas/reference/blas2.hpp | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 41c6c8da0..fe9588d15 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2599,8 +2599,43 @@ namespace grb { return internal::dense_unmasked_sequential_sptrsv< descr, false >( xb_p, T, 0, n, forward, semiring, subtraction, division, phase ); #else + // The below code is only for testing (DBG): + #if 0 + if( phase == grb::RESIZE ) { return grb::SUCCESS; } + const auto sptrsv = internal::getSptrsvData( T ); + #pragma omp parallel num_threads(sptrsv->nThreads) + { + const auto &crs = internal::getCRS( T ); + const int *__restrict__ schedule = reinterpret_cast< int * >( + sptrsv->data[ omp_get_thread_num() ] ); + for( size_t step = 0; step < sptrsv->supersteps; ++step ) { + const int &lo = *schedule++; + const int &no = *schedule++; + const int upper_limit = lo + no; + for( int row_idx = lo; row_idx < upper_limit; ++row_idx ) { + // if not sorted, enable this variant instead: + // IOType div = 0; + // for( unsigned int k = crs.col_start[ row_idx ]; k < crs.col_start[ row_idx + 1 ]; ++k ) { + for( unsigned int k = crs.col_start[ row_idx ]; k < crs.col_start[ row_idx + 1 ] - 1; ++k ) { + const int &j = crs.row_index[ k ]; + const double &value = crs.values[ k ]; + // if not sorted, enable this if-else: + // if( j == row_idx ) { + // div = value; + // } else { + xb_p[ row_idx ] -= value * xb_p[ j ]; + //} + } + xb_p[ row_idx ] /= crs.values[ crs.col_start[ row_idx + 1 ] - 1 ]; + } + #pragma omp barrier + } + } + return grb::SUCCESS; + #else return internal::dense_unmasked_omp_sptrsv< descr >( xb_p, T, n, forward, semiring, subtraction, division, phase ); + #endif #endif } else { grb::Vector< IOType, reference, Coords > no_mask( 0 ); From 555bc5f1ad83cb232b4c754199242e8f5de3cbbd Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 17 Apr 2025 13:05:48 +0200 Subject: [PATCH 51/55] Make sorted-ness configurable --- .../graphblas/reference/SptrsvSchedule.hpp | 14 +- include/graphblas/reference/blas2.hpp | 74 ++++++--- include/graphblas/reference/config.hpp | 29 +++- include/graphblas/reference/matrix.hpp | 145 ++++++++++++------ 4 files changed, 192 insertions(+), 70 deletions(-) diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp index 8811f80cc..ea25e8229 100644 --- a/include/graphblas/reference/SptrsvSchedule.hpp +++ b/include/graphblas/reference/SptrsvSchedule.hpp @@ -76,6 +76,9 @@ namespace grb { #endif // note that this is a trivial schedule is_simple = true; + + // the default schedule does not (cannot) sort input matrices + is_sorted = false; } /** @@ -84,12 +87,14 @@ namespace grb { void moveImpl( SptrsvSchedule &&toMove ) { _deleters = std::move( toMove._deleters ); default_schedule = std::move( toMove.default_schedule ); + is_sorted = toMove.is_sorted; is_simple = toMove.is_simple; supersteps = toMove.supersteps; nThreads = toMove.nThreads; data = std::move( toMove.data ); endPositions = std::move( toMove.endPositions ); nRanges = std::move( toMove.nRanges ); + toMove.is_sorted = true; toMove.is_simple = false; toMove.supersteps = 0; toMove.nThreads = 0; @@ -98,6 +103,9 @@ namespace grb { public: + /** Whether the matrix is sorted as part of tuning. */ + bool is_sorted; + /** Whether the schedule is simple.*/ bool is_simple; @@ -133,7 +141,8 @@ namespace grb { * first (and only) thread. */ SptrsvSchedule( const NIT n ) : - _deleters( 1 ), is_simple( false ), supersteps( 1 ), nThreads( 1 ), + _deleters( 1 ), is_sorted( true ), is_simple( false ), + supersteps( 1 ), nThreads( 1 ), data( 1 ), endPositions( 1 ), nRanges( 1 ) { data[ 0 ] = nullptr; @@ -151,7 +160,8 @@ namespace grb { * a single-superstep schedule that assigns all work to the first thread. */ SptrsvSchedule( const NIT n, const size_t T ) : - _deleters( 2 * T ), is_simple( false ), supersteps( 1 ), nThreads( T ), + _deleters( 2 * T ), is_sorted( true ), is_simple( false ), + supersteps( 1 ), nThreads( T ), data( T, nullptr ), endPositions( T, nullptr ), nRanges( T, 0 ) { if( T == 0 || T > std::numeric_limits< int >::max() ) { diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index 03aad374d..a623b94b4 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2523,15 +2523,31 @@ namespace grb { const NIT *__restrict__ data = reinterpret_cast< const NIT * >(sptrsv.data[ s ]); assert( data != nullptr ); - for( size_t i = 0; i < sptrsv.supersteps; ++i ) { - const size_t lo = static_cast< size_t >( *data++ ); - const size_t no = static_cast< size_t >( *data++ ); - assert( lo < n ); - assert( no + lo <= n ); - local_rc = local_rc - ? local_rc - : dense_unmasked_sequential_sptrsv< descr, true, 1 >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + if( sptrsv.is_sorted ) { + for( size_t i = 0; i < sptrsv.supersteps; ++i ) { + const size_t lo = static_cast< size_t >( *data++ ); + const size_t no = static_cast< size_t >( *data++ ); + assert( lo < n ); + assert( no + lo <= n ); + local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< + descr, true, config::tuning::SpTRSV::sortingMode + >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase + ); + } + #pragma omp barrier + } else { + for( size_t i = 0; i < sptrsv.supersteps; ++i ) { + const size_t lo = static_cast< size_t >( *data++ ); + const size_t no = static_cast< size_t >( *data++ ); + assert( lo < n ); + assert( no + lo <= n ); + local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< + descr, true, 0 + >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase + ); + } #pragma omp barrier } if( local_rc != grb::SUCCESS ) { @@ -2549,18 +2565,36 @@ namespace grb { const NIT *__restrict__ const end = reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]); assert( end != nullptr ); - for( size_t i = 0; i < sptrsv.supersteps; ++i ) { - for( size_t k = 0; k < end[ i ]; ++k ) { - const size_t &lo = static_cast< size_t >( *data++ ); - const size_t &no = static_cast< size_t >( *data++ ); - assert( lo < n ); - assert( no + lo <= n ); - local_rc = local_rc - ? local_rc - : dense_unmasked_sequential_sptrsv< descr, true, 1 >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase ); + if( sptrsv.is_sorted ) { + for( size_t i = 0; i < sptrsv.supersteps; ++i ) { + for( size_t k = 0; k < end[ i ]; ++k ) { + const size_t &lo = static_cast< size_t >( *data++ ); + const size_t &no = static_cast< size_t >( *data++ ); + assert( lo < n ); + assert( no + lo <= n ); + local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< + descr, true, config::tuning::SpTRSV::sortingMode + >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase + ); + } + #pragma omp barrier + } + } else { + for( size_t i = 0; i < sptrsv.supersteps; ++i ) { + for( size_t k = 0; k < end[ i ]; ++k ) { + const size_t &lo = static_cast< size_t >( *data++ ); + const size_t &no = static_cast< size_t >( *data++ ); + assert( lo < n ); + assert( no + lo <= n ); + local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< + descr, true, 0 + >( + v_raw, T, lo, no, forward, semiring, subtraction, division, phase + ); + } + #pragma omp barrier } - #pragma omp barrier } if( local_rc != grb::SUCCESS ) { #pragma omp atomic write diff --git a/include/graphblas/reference/config.hpp b/include/graphblas/reference/config.hpp index 82e067ffd..1d9a8ebbf 100644 --- a/include/graphblas/reference/config.hpp +++ b/include/graphblas/reference/config.hpp @@ -333,7 +333,34 @@ namespace grb { /** @} */ - } // namespace config + + /** + * Collects all optimisations with regards to tuning operations for specific + * inputs, i.e., #grb::PHASE::TUNE. + */ + namespace tuning { + + /** + * Collects all options for SpTRSV tuning. + */ + struct SpTRSV { + + /** + * There are three sorting modes implemented: + * - 0: the matrix rows are not sorted; + * - 1: the matrix rows have diagonal elements at position 0; + * - 2: the matrix rows are sorted. + * The above description assumes CRS, however, the chosen sorting mode is + * also applied to the CCS (in which case the above three points apply with + * `rows' substituted by `columns'). + */ + static constexpr int sortingMode = 2; + + }; + + } // namespace grb::config::tuning + + } // namespace grb::config } // namespace grb diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 27e97cb22..6b7cb29ba 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -322,59 +322,110 @@ namespace grb { "nThreads" ); } - // re-sort A, CRS first - { - const auto &crs = internal::getCRS( A ); - const size_t nThreads = A.m < config::OMP::minLoopSize() - ? 1 - : std::min( - config::OMP::threads(), - A.nz / config::CACHE_LINE_SIZE::value() - ); - #pragma omp parallel for num_threads( nThreads ) - for( size_t i = 0; i < A.m; ++i ) { - std::vector< std::pair< int, double > > pairs; - for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { - pairs.push_back( std::make_pair( crs.row_index[ k ], crs.values[ k ] ) ); + // re-sort A, if enabled + sptrsv.is_sorted = true; + if( config::tuning::SpTRSV::sortingMode == 2 ) { + // CRS first + { + const auto &crs = internal::getCRS( A ); + const size_t nThreads = A.m < config::OMP::minLoopSize() + ? 1 + : std::min( + config::OMP::threads(), + A.nz / config::CACHE_LINE_SIZE::value() + ); + #pragma omp parallel for num_threads( nThreads ) + for( size_t i = 0; i < A.m; ++i ) { + std::vector< std::pair< int, double > > pairs; + for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) { + pairs.push_back( std::make_pair( crs.row_index[ k ], crs.values[ k ] ) ); + } + std::sort( pairs.begin(), pairs.end(), + []( const std::pair< int, double > &left, const std::pair< int, double > &right ) { + return left.first < right.first; + } ); + auto it = pairs.cbegin(); + for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) { + assert( it != pairs.cend() ); + crs.row_index[ k ] = it->first; + crs.values[ k ] = it->second; + } } - std::sort( pairs.begin(), pairs.end(), - []( const std::pair< int, double > &left, const std::pair< int, double > &right ) { - return left.first < right.first; - } ); - auto it = pairs.cbegin(); - for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) { - assert( it != pairs.cend() ); - crs.row_index[ k ] = it->first; - crs.values[ k ] = it->second; + } + // then CCS + { + const auto &ccs = internal::getCCS( A ); + const size_t nThreads = A.n < config::OMP::minLoopSize() + ? 1 + : std::min( + config::OMP::threads(), + A.nz / config::CACHE_LINE_SIZE::value() + ); + #pragma omp parallel for num_threads( nThreads ) + for( size_t i = 0; i < A.n; ++i ) { + std::vector< std::pair< int, double > > pairs; + for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k ) { + pairs.push_back( std::make_pair( ccs.row_index[ k ], ccs.values[ k ] ) ); + } + std::sort( pairs.begin(), pairs.end(), + []( const std::pair< int, double > &left, const std::pair< int, double > &right ) { + return left.first < right.first; + } ); + auto it = pairs.cbegin(); + for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k, ++it ) { + assert( it != pairs.cend() ); + ccs.row_index[ k ] = it->first; + ccs.values[ k ] = it->second; + } } } - } - // then CCS - { - const auto &ccs = internal::getCCS( A ); - const size_t nThreads = A.n < config::OMP::minLoopSize() - ? 1 - : std::min( - config::OMP::threads(), - A.nz / config::CACHE_LINE_SIZE::value() - ); - #pragma omp parallel for num_threads( nThreads ) - for( size_t i = 0; i < A.n; ++i ) { - std::vector< std::pair< int, double > > pairs; - for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k ) { - pairs.push_back( std::make_pair( ccs.row_index[ k ], ccs.values[ k ] ) ); + } else if( config::tuning::SpTRSV::sortingMode == 1 ) { + { + const auto &crs = internal::getCRS( A ); + const size_t nThreads = A.m < config::OMP::minLoopSize() + ? 1 + : std::min( + config::OMP::threads(), + A.nz / config::CACHE_LINE_SIZE::value() + ); + #pragma omp parallel for num_threads( nThreads ) + for( size_t i = 0; i < A.m; ++i ) { + const auto index_it = std::find( + crs.row_index + crs.col_start[ i ], + crs.row_index + crs.col_start[ i + 1 ], + i + ); + if( !std::is_void< InputType >::value ) { + const size_t index = std::distance( crs.row_index, index_it ); + std::swap( crs.values[ index ], crs.values[ crs.col_start[ i ] ] ); + } + std::swap( *index_it, crs.row_index[ crs.col_start[ i ] ] ); } - std::sort( pairs.begin(), pairs.end(), - []( const std::pair< int, double > &left, const std::pair< int, double > &right ) { - return left.first < right.first; - } ); - auto it = pairs.cbegin(); - for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k, ++it ) { - assert( it != pairs.cend() ); - ccs.row_index[ k ] = it->first; - ccs.values[ k ] = it->second; + } + { + const auto &ccs = internal::getCCS( A ); + const size_t nThreads = A.n < config::OMP::minLoopSize() + ? 1 + : std::min( + config::OMP::threads(), + A.nz / config::CACHE_LINE_SIZE::value() + ); + #pragma omp parallel for num_threads( nThreads ) + for( size_t i = 0; i < A.n; ++i ) { + const auto index_it = std::find( + ccs.row_index + ccs.col_start[ i ], + ccs.row_index + ccs.col_start[ i + 1 ], + i + ); + if( !std::is_void< InputType >::value ) { + const size_t index = std::distance( ccs.row_index, index_it ); + std::swap( ccs.values[ index ], ccs.values[ ccs.col_start[ i ] ] ); + } + std::swap( *index_it, ccs.row_index[ ccs.col_start[ i ] ] ); } } + } else { + sptrsv.is_sorted = false; } sptrsv.is_simple = true; From e23d8ee9f54897b2454ef9dadc9545f6e9ba5e93 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 17 Apr 2025 16:53:11 +0200 Subject: [PATCH 52/55] Make unsorted the default (no sorting overhead --- include/graphblas/reference/config.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/graphblas/reference/config.hpp b/include/graphblas/reference/config.hpp index 1d9a8ebbf..116ed0202 100644 --- a/include/graphblas/reference/config.hpp +++ b/include/graphblas/reference/config.hpp @@ -354,7 +354,7 @@ namespace grb { * also applied to the CCS (in which case the above three points apply with * `rows' substituted by `columns'). */ - static constexpr int sortingMode = 2; + static constexpr int sortingMode = 0; }; From fd1ce27c9c79e23afd2b350282902e627ce584f2 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 17 Apr 2025 16:53:24 +0200 Subject: [PATCH 53/55] Make forward a template parameter rather than a run-time switch --- include/graphblas/reference/blas2.hpp | 120 +++++++++++++++++--------- 1 file changed, 77 insertions(+), 43 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index a623b94b4..a9568ab37 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2224,10 +2224,13 @@ namespace grb { * 1: the matrix rows have diagonal elements at position 0, * 2: the matrix rows are sorted. * + * @tparam forward True: perform forward substitution. + * False: perform backward substitution. + * * \note The above description assumes CRS. */ template< - int sorted, + int sorted, bool forward, typename IOType, typename InputType1, typename IND, typename NIT, class Semiring, class Subtraction @@ -2236,7 +2239,6 @@ namespace grb { const Compressed_Storage< InputType1, IND, NIT > &crs, const size_t &i, IOType *__restrict__ const &x, - const bool &forward, IOType &divBy, const Semiring &semiring, const Subtraction &subtraction @@ -2346,25 +2348,25 @@ namespace grb { } template< - typename IOType, int sorted, + typename IOType, int sorted, bool forward, typename VIT, typename RCIT, typename NIT > inline static IOType getDiagonalEntry( const Compressed_Storage< VIT, RCIT, NIT > &storage, const size_t &i, - const bool &forward, const IOType &one + const IOType &one ) { assert( sorted >= 1 && sorted < 3 ); assert( storage.col_start[ i + 1 ] > storage.col_start[ i ] ); const size_t diag_index = sorted == 1 ? storage.col_start[ i ] : ( forward ? (storage.col_start[ i + 1 ] - 1) : storage.col_start[ i ] ); - assert( storage.col_index[ diag_index ] == i ); + assert( storage.row_index[ diag_index ] == i ); return storage.template getValue( diag_index, one ); } #endif // end ifndef _H_GRB_REFERENCE_OMP_BLAS2 /** \internal Specialised dense unmasked sptrsv implementation, sequential */ template< - Descriptor descr, bool maybe_offset, int sorted, + Descriptor descr, bool maybe_offset, int sorted, bool forward, class Semiring, class Subtraction, class Division, typename IOType, typename InputType1, typename RIT, typename CIT, typename NIT @@ -2373,7 +2375,6 @@ namespace grb { IOType *__restrict__ const &v_raw, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, const size_t &offset, const size_t &n, - const bool &forward, const Semiring &semiring, const Subtraction &subtraction, const Division &division, @@ -2383,10 +2384,6 @@ namespace grb { static_assert( sorted >= 0 && sorted < 3, "Invalid value for sorted; this " "an internal error, please submit a bug report" ); - // dynamic sanity checks - assert( grb::nrows( T ) == n ); - assert( grb::ncols( T ) == n ); - // in dense unmasked, resize is a no-op if( phase == grb::RESIZE ) { return grb::SUCCESS; } @@ -2407,11 +2404,14 @@ namespace grb { // in this case we will auto-detect the diagonal item divBy = semiring.template getZero< IOType >(); } else { - divBy = getDiagonalEntry< IOType, sorted, InputType1, RIT, NIT >( - crs, i, forward, one ); + divBy = getDiagonalEntry< + IOType, sorted, forward, InputType1, RIT, NIT + > ( + crs, i, one + ); } assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); - sptrsv_kernel< sorted >( crs, i, v_raw, forward, divBy, semiring, + sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring, subtraction ); #ifdef _DEBUG std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy @@ -2426,10 +2426,10 @@ namespace grb { if( !sorted ) { divBy = semiring.template getZero< IOType >(); } else { - divBy = getDiagonalEntry< IOType, sorted >( crs, i, forward, one ); + divBy = getDiagonalEntry< IOType, sorted, forward >( crs, i, one ); } assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); - sptrsv_kernel< sorted >( crs, i, v_raw, forward, divBy, semiring, + sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring, subtraction ); #ifdef _DEBUG std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy @@ -2446,10 +2446,10 @@ namespace grb { if( !sorted ) { divBy = semiring.template getZero< IOType >(); } else { - divBy = getDiagonalEntry< IOType, sorted >( ccs, i, forward, one ); + divBy = getDiagonalEntry< IOType, sorted, forward >( ccs, i, one ); } assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); - sptrsv_kernel< sorted >( ccs, i, v_raw, forward, divBy, semiring, + sptrsv_kernel< sorted, forward >( ccs, i, v_raw, divBy, semiring, subtraction ); #ifdef _DEBUG std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy @@ -2463,10 +2463,10 @@ namespace grb { if( !sorted ) { divBy = semiring.template getZero< IOType >(); } else { - divBy = getDiagonalEntry< IOType, sorted >( ccs, i, forward, one ); + divBy = getDiagonalEntry< IOType, sorted, forward >( ccs, i, one ); } assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] ); - sptrsv_kernel< sorted >( ccs, i, v_raw, forward, divBy, semiring, + sptrsv_kernel< sorted, forward >( ccs, i, v_raw, divBy, semiring, subtraction ); #ifdef _DEBUG std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy @@ -2484,7 +2484,7 @@ namespace grb { #ifdef _H_GRB_REFERENCE_OMP_BLAS2 /** \internal Specialised dense unmasked sptrsv implementation, OpenMP */ template< - Descriptor descr, + Descriptor descr, bool forward, class Semiring, class Subtraction, class Division, typename IOType, typename InputType1, typename RIT, typename CIT, typename NIT @@ -2493,7 +2493,6 @@ namespace grb { IOType *__restrict__ const &v_raw, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, const size_t &n, - const bool &forward, const Semiring &semiring, const Subtraction &subtraction, const Division &division, @@ -2530,9 +2529,9 @@ namespace grb { assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< - descr, true, config::tuning::SpTRSV::sortingMode + descr, true, config::tuning::SpTRSV::sortingMode, forward >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase + v_raw, T, lo, no, semiring, subtraction, division, phase ); } #pragma omp barrier @@ -2543,9 +2542,9 @@ namespace grb { assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< - descr, true, 0 + descr, true, 0, forward >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase + v_raw, T, lo, no, semiring, subtraction, division, phase ); } #pragma omp barrier @@ -2573,9 +2572,9 @@ namespace grb { assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< - descr, true, config::tuning::SpTRSV::sortingMode + descr, true, config::tuning::SpTRSV::sortingMode, forward >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase + v_raw, T, lo, no, semiring, subtraction, division, phase ); } #pragma omp barrier @@ -2588,9 +2587,9 @@ namespace grb { assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< - descr, true, 0 + descr, true, 0, forward >( - v_raw, T, lo, no, forward, semiring, subtraction, division, phase + v_raw, T, lo, no, semiring, subtraction, division, phase ); } #pragma omp barrier @@ -2613,7 +2612,7 @@ namespace grb { * sptrsv. */ template< - Descriptor descr, + Descriptor descr, bool forward, bool masked, bool sparse, class Semiring, class Subtraction, class Division, typename IOType, typename InputType1, typename InputType2, @@ -2624,7 +2623,6 @@ namespace grb { const Vector< InputType2, reference, Coords > &mask, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, const size_t &n, - const bool forward, const Semiring &semiring, const Subtraction &subtraction, const Division &division, @@ -2640,6 +2638,8 @@ namespace grb { assert( grb::nrows( T ) == n ); assert( grb::ncols( T ) == n ); + (void) descr; + (void) forward; (void) masked; (void) sparse; #ifdef NDEBUG @@ -2648,7 +2648,6 @@ namespace grb { (void) T; (void) n; #endif - (void) forward; (void) semiring; (void) subtraction; (void) division; @@ -2707,8 +2706,13 @@ namespace grb { if( dense || grb::nnz( xb ) == n ) { IOType * const xb_p = internal::getRaw( xb ); #ifndef _H_GRB_REFERENCE_OMP_BLAS2 - return internal::dense_unmasked_sequential_sptrsv< descr, false, 0 >( - xb_p, T, 0, n, forward, semiring, subtraction, division, phase ); + if( forward ) { + return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, true >( + xb_p, T, 0, n, semiring, subtraction, division, phase ); + } else { + return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, false >( + xb_p, T, 0, n, semiring, subtraction, division, phase ); + } #else // The below code is only for testing (DBG): #if 0 @@ -2744,14 +2748,28 @@ namespace grb { } return grb::SUCCESS; #else - return internal::dense_unmasked_omp_sptrsv< descr >( - xb_p, T, n, forward, semiring, subtraction, division, phase ); + if( forward ) { + return internal::dense_unmasked_omp_sptrsv< descr, true >( + xb_p, T, n, semiring, subtraction, division, phase ); + } else { + return internal::dense_unmasked_omp_sptrsv< descr, false >( + xb_p, T, n, semiring, subtraction, division, phase ); + } #endif #endif } else { grb::Vector< IOType, reference, Coords > no_mask( 0 ); - return internal::generic_sptrsv< descr, false, true >( xb, no_mask, T, n, - forward, semiring, subtraction, division, phase ); + if( forward ) { + return internal::generic_sptrsv< descr, false, true, true >( + xb, no_mask, T, n, + semiring, subtraction, division, phase + ); + } else { + return internal::generic_sptrsv< descr, false, true, false>( + xb, no_mask, T, n, + semiring, subtraction, division, phase + ); + } } } @@ -2817,11 +2835,27 @@ namespace grb { } if( dense || (grb::nnz( xb ) == n && grb::nnz( mask ) == n) ) { - return internal::generic_sptrsv< descr, true, false >( xb, mask, T, n, - forward, semiring, subtraction, division, phase ); + if( forward ) { + return internal::generic_sptrsv< descr, true, false, true >( + xb, mask, T, n, + semiring, subtraction, division, phase + ); + } else { + return internal::generic_sptrsv< descr, true, false, false >( + xb, mask, T, n, + semiring, subtraction, division, phase + ); + } } else { - return internal::generic_sptrsv< descr, true, true >( xb, mask, T, n, - forward, semiring, subtraction, division, phase ); + if( forward ) { + return internal::generic_sptrsv< descr, true, true, true >( + xb, mask, T, n, + semiring, subtraction, division, phase ); + } else { + return internal::generic_sptrsv< descr, true, true, false >( + xb, mask, T, n, + semiring, subtraction, division, phase ); + } } } From a346aa17295a3454033959fd2351d2869367f0b2 Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 17 Apr 2025 19:09:29 +0200 Subject: [PATCH 54/55] Optimisations and bugfixes --- include/graphblas/reference/blas2.hpp | 63 +++++++++++++++------------ 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp index a9568ab37..41fac2d0b 100644 --- a/include/graphblas/reference/blas2.hpp +++ b/include/graphblas/reference/blas2.hpp @@ -2368,13 +2368,13 @@ namespace grb { template< Descriptor descr, bool maybe_offset, int sorted, bool forward, class Semiring, class Subtraction, class Division, - typename IOType, typename InputType1, + typename IOPtrType, typename InputType1, typename RIT, typename CIT, typename NIT > RC dense_unmasked_sequential_sptrsv( - IOType *__restrict__ const &v_raw, + const IOPtrType &v_raw, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, - const size_t &offset, const size_t &n, + const NIT &offset, const NIT &n, const Semiring &semiring, const Subtraction &subtraction, const Division &division, @@ -2384,11 +2384,16 @@ namespace grb { static_assert( sorted >= 0 && sorted < 3, "Invalid value for sorted; this " "an internal error, please submit a bug report" ); - // in dense unmasked, resize is a no-op - if( phase == grb::RESIZE ) { return grb::SUCCESS; } - // only execute and resize are supported assert( phase == grb::EXECUTE ); +#ifdef NDEBUG + (void) phase; +#endif + + // get value type from the pointer + typedef typename std::remove_reference< + decltype( *std::declval< IOPtrType >() ) + >::type IOType; // in case of pattern matrices, get 1 from the semiring const IOType one = semiring.getMultiplicativeMonoid().template @@ -2404,17 +2409,13 @@ namespace grb { // in this case we will auto-detect the diagonal item divBy = semiring.template getZero< IOType >(); } else { - divBy = getDiagonalEntry< - IOType, sorted, forward, InputType1, RIT, NIT - > ( - crs, i, one - ); + divBy = getDiagonalEntry< IOType, sorted, forward >( crs, i, one ); } assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] ); sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring, subtraction ); #ifdef _DEBUG - std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy + std::cout << "\trow v_raw[ " << i << " ] = " << v_raw[ i ] << " will be normalised with " << divBy << "\n"; #endif (void) grb::foldl( v_raw[ i ], divBy, division ); @@ -2432,7 +2433,7 @@ namespace grb { sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring, subtraction ); #ifdef _DEBUG - std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy + std::cout << "\trow v_raw[ " << i << " ] = " << v_raw[ i ] << " will be normalised with " << divBy << "\n"; #endif (void) grb::foldl( v_raw[ i ], divBy, division ); @@ -2490,7 +2491,7 @@ namespace grb { typename RIT, typename CIT, typename NIT > RC dense_unmasked_omp_sptrsv( - IOType *__restrict__ const &v_raw, + IOType * const &v_raw, const Matrix< InputType1, reference, RIT, CIT, NIT > &T, const size_t &n, const Semiring &semiring, @@ -2524,8 +2525,8 @@ namespace grb { assert( data != nullptr ); if( sptrsv.is_sorted ) { for( size_t i = 0; i < sptrsv.supersteps; ++i ) { - const size_t lo = static_cast< size_t >( *data++ ); - const size_t no = static_cast< size_t >( *data++ ); + const auto &lo = *data++; + const auto &no = *data++; assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< @@ -2533,12 +2534,12 @@ namespace grb { >( v_raw, T, lo, no, semiring, subtraction, division, phase ); + #pragma omp barrier } - #pragma omp barrier } else { for( size_t i = 0; i < sptrsv.supersteps; ++i ) { - const size_t lo = static_cast< size_t >( *data++ ); - const size_t no = static_cast< size_t >( *data++ ); + const NIT &lo = *data++; + const NIT &no = *data++; assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< @@ -2546,8 +2547,8 @@ namespace grb { >( v_raw, T, lo, no, semiring, subtraction, division, phase ); + #pragma omp barrier } - #pragma omp barrier } if( local_rc != grb::SUCCESS ) { #pragma omp atomic write @@ -2567,8 +2568,8 @@ namespace grb { if( sptrsv.is_sorted ) { for( size_t i = 0; i < sptrsv.supersteps; ++i ) { for( size_t k = 0; k < end[ i ]; ++k ) { - const size_t &lo = static_cast< size_t >( *data++ ); - const size_t &no = static_cast< size_t >( *data++ ); + const NIT &lo = *data++; + const NIT &no = *data++; assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< @@ -2582,8 +2583,8 @@ namespace grb { } else { for( size_t i = 0; i < sptrsv.supersteps; ++i ) { for( size_t k = 0; k < end[ i ]; ++k ) { - const size_t &lo = static_cast< size_t >( *data++ ); - const size_t &no = static_cast< size_t >( *data++ ); + const NIT &lo = *data++; + const NIT &no = *data++; assert( lo < n ); assert( no + lo <= n ); local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv< @@ -2702,21 +2703,26 @@ namespace grb { // check trivial if( n == 0 ) { return grb::SUCCESS; } + // in the dense variant, resize does nothing + if( phase == grb::RESIZE ) { return grb::SUCCESS; } + // check dense dispatch if( dense || grb::nnz( xb ) == n ) { - IOType * const xb_p = internal::getRaw( xb ); #ifndef _H_GRB_REFERENCE_OMP_BLAS2 + IOType *__restrict__ const xb_p = internal::getRaw( xb ); + const NIT lo = 0; + const NIT no = n; if( forward ) { return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, true >( - xb_p, T, 0, n, semiring, subtraction, division, phase ); + xb_p, T, lo, no, semiring, subtraction, division, phase ); } else { return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, false >( - xb_p, T, 0, n, semiring, subtraction, division, phase ); + xb_p, T, lo, no, semiring, subtraction, division, phase ); } #else + IOType * const xb_p = internal::getRaw( xb ); // The below code is only for testing (DBG): #if 0 - if( phase == grb::RESIZE ) { return grb::SUCCESS; } const auto sptrsv = internal::getSptrsvData( T ); #pragma omp parallel num_threads(sptrsv->nThreads) { @@ -2741,6 +2747,7 @@ namespace grb { xb_p[ row_idx ] -= value * xb_p[ j ]; //} } + #pragma omp critical xb_p[ row_idx ] /= crs.values[ crs.col_start[ row_idx + 1 ] - 1 ]; } #pragma omp barrier From 763b89f6232dc61302f03044691f96b642fd1b4c Mon Sep 17 00:00:00 2001 From: "Albert-Jan N. Yzelman" Date: Thu, 17 Apr 2025 19:09:46 +0200 Subject: [PATCH 55/55] More robust implementation of sorting mode 1 --- include/graphblas/reference/matrix.hpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp index 6b7cb29ba..1b99614ef 100644 --- a/include/graphblas/reference/matrix.hpp +++ b/include/graphblas/reference/matrix.hpp @@ -412,16 +412,18 @@ namespace grb { ); #pragma omp parallel for num_threads( nThreads ) for( size_t i = 0; i < A.n; ++i ) { - const auto index_it = std::find( - ccs.row_index + ccs.col_start[ i ], - ccs.row_index + ccs.col_start[ i + 1 ], - i - ); - if( !std::is_void< InputType >::value ) { - const size_t index = std::distance( ccs.row_index, index_it ); - std::swap( ccs.values[ index ], ccs.values[ ccs.col_start[ i ] ] ); + const auto start = ccs.row_index + ccs.col_start[ i ]; + const auto end = ccs.row_index + ccs.col_start[ i + 1 ]; + const auto index_it = std::find( start, end, i ); + if( index_it == end ) { + throw std::runtime_error( "No diagonal element present" ); + } else if( index_it != start ) { + if( !std::is_void< InputType >::value ) { + const size_t index = std::distance( ccs.row_index, index_it ); + std::swap( ccs.values[ index ], ccs.values[ ccs.col_start[ i ] ] ); + } + std::swap( *index_it, ccs.row_index[ ccs.col_start[ i ] ] ); } - std::swap( *index_it, ccs.row_index[ ccs.col_start[ i ] ] ); } } } else {