diff --git a/include/graphblas/base/blas2.hpp b/include/graphblas/base/blas2.hpp
index 3b5e5df4e..2d61d46d4 100644
--- a/include/graphblas/base/blas2.hpp
+++ b/include/graphblas/base/blas2.hpp
@@ -337,6 +337,149 @@ namespace grb {
 		return UNSUPPORTED;
 	}
 
+	/**
+	 * Computes \f$ x \f$ from \f$ Tx=b \f$.
+	 *
+	 * \warning This is an experimental feature.
+	 *
+	 * Here, \f$ T \f$ and \f$ b \f$ are given, while \f$ T \f$ must additionally
+	 * be either lower- or upper-triangular. The output \f$ x \f$ may furthermore
+	 * be masked.
+	 *
+	 * @param[in,out] xb On input: the vector b. On output: the vector x.
+	 *                   When #grb::EXECUTE is given, this vector must have
+	 *                   sufficient capacity to store both any pre-existing
+	 *                   nonzeroes in \f$ b \f$ plus the nonzeroes in the (masked)
+	 *                   result of \f$ x \f$.
+	 *                   When #grb::RESIZE is given, the vector contents are
+	 *                   unchanged; however, its capacity may be enlarged in order
+	 *                   to ensure the above-described condition.
+	 *
+	 * \note On input, \a xb may contain both implicit and explicit zeroes.
+	 *
+	 * \note On output, \a xb may contain both implicit and explicit zeroes, even
+	 *       if on input it did not.
+	 *
+	 * @param[in] mask The mask that acts on \f$ x \f$. This vector has either
+	 *                 size zero or size equal to that of \a xb.
+	 *
+	 * \note If \a mask has size zero, it is interpreted as though the operation
+	 *       is unmasked.
+	 *
+	 * @param[in] T The upper- or lower-triangular input matrix. This must be
+	 *              a square matrix with size equal to that of \a xb.
+	 * @param[in] forward Whether to perform forward substitution (i.e., \f$ T \f$
+	 *                    is lower-triangular), or to perform backward substitution
+	 *                    instead (i.e., \f$ T \f$ is upper-triangular).
+	 *
+	 * \note With the structural information ALP/Dense passes as template
+	 *       information, \a forward would not be required.
+	 *
+	 * @param[in] semiring The semiring under which to perform the SpTRSV.
+	 * @param[in] subtraction The inverse of the additive operator of \a semiring.
+	 * @param[in] division The inverse of the multiplicative operator of
+	 *                     \a semiring.
+	 *
+	 * \note That is, this operation in fact requires a field structure and not a
+	 *       semiring. A future extension to ALP may provide such a structure
+	 *       explicitly.
+	 *
+	 * @param[in] phase The requested phase of the computation. Only
+	 *                  #grb::EXECUTE and #grb::RESIZE are supported.
+	 *
+	 * \todo Expand documentation.
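+	 *
+	 * \par Example
+	 * A minimal illustrative sketch of a forward solve over the reals. This is
+	 * not normative documentation: the names \a L, \a x, and \a n are
+	 * hypothetical, and the usual plus-times semiring with matching subtraction
+	 * and division operators is assumed.
+	 * \code
+	 * grb::Semiring<
+	 *     grb::operators::add< double >, grb::operators::mul< double >,
+	 *     grb::identities::zero, grb::identities::one
+	 * > reals;
+	 * grb::operators::subtract< double > minus;
+	 * grb::operators::divide< double > div;
+	 * grb::Vector< double > xb( n );  // contains b on input, x on output
+	 * grb::Vector< bool > mask( 0 );  // a size-zero mask: the solve is unmasked
+	 * // forward == true, hence L must be lower-triangular
+	 * grb::RC rc = grb::sptrsv( xb, mask, L, true, reals, minus, div );
+	 * \endcode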
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1, typename InputType2,
+		typename Coords, typename RIT, typename CIT, typename NIT,
+		Backend backend
+	>
+	RC sptrsv(
+		Vector< IOType, backend, Coords > &xb,
+		const Vector< InputType2, backend, Coords > &mask,
+		const Matrix< InputType1, backend, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value &&
+			!grb::is_object< InputType2 >::value,
+		void >::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cerr << "Selected backend does not implement masked grb::sptrsv\n";
+#endif
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_masked_sptrsv = false;
+		assert( selected_backend_does_not_support_masked_sptrsv );
+#endif
+		(void) xb;
+		(void) mask;
+		(void) T;
+		(void) forward;
+		(void) semiring;
+		(void) subtraction;
+		(void) division;
+		(void) phase;
+		return UNSUPPORTED;
+	}
+
+	/**
+	 * Computes \f$ x \f$ from \f$ Tx = b \f$, unmasked variant.
+	 *
+	 * \warning This is an experimental feature.
+	 *
+	 * \todo Extend documentation.
+	 */
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1,
+		typename Coords, typename RIT, typename CIT, typename NIT,
+		Backend backend
+	>
+	RC sptrsv(
+		Vector< IOType, backend, Coords > &xb,
+		const Matrix< InputType1, backend, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value,
+		void >::type * const = nullptr
+	) {
+#ifdef _DEBUG
+		std::cerr << "Selected backend does not implement unmasked grb::sptrsv\n";
+#endif
+#ifndef NDEBUG
+		const bool selected_backend_does_not_support_unmasked_sptrsv = false;
+		assert( selected_backend_does_not_support_unmasked_sptrsv );
+#endif
+		(void) xb;
+		(void) T;
+		(void) forward;
+		(void) semiring;
+		(void) subtraction;
+		(void) division;
+		(void) phase;
+		return UNSUPPORTED;
+	}
+
 	/**
 	 * Executes an arbitrary element-wise user-defined function \a f on all
 	 * nonzero elements of a given matrix \a A.
diff --git a/include/graphblas/reference/SptrsvSchedule.hpp b/include/graphblas/reference/SptrsvSchedule.hpp
new file mode 100644
index 000000000..ea25e8229
--- /dev/null
+++ b/include/graphblas/reference/SptrsvSchedule.hpp
@@ -0,0 +1,236 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * @author A. N. Yzelman
+ * @date 4th of April, 2025
+ */
+
+#ifndef _H_GRB_REFERENCE_SPTRSVSCHEDULE
+#define _H_GRB_REFERENCE_SPTRSVSCHEDULE
+
+#include <limits>
+#include <vector>
+
+#include <graphblas/utils/alloc.hpp>
+
+
+namespace grb {
+
+	namespace internal {
+
+		template< typename NIT >
+		class SptrsvSchedule {
+
+			private:
+
+				/** One deleter for each chunk of thread-local data. */
+				std::vector< utils::AutoDeleter< char > > _deleters;
+
+				/**
+				 * Fixed-size buffer for realising the default schedule.
+				 *
+				 * The first two positions are reserved for the range of the default
+				 * schedule, while the last position is reserved for the end-position of
+				 * the default schedule.
+				 */
+				NIT default_schedule[ 3 ];
+
+				/** Computes and initialises a trivial schedule. */
+				void initTrivial( const NIT n ) {
+					// dynamic sanity check
+					assert( data[ 0 ] == nullptr );
+					// set trivial schedule
+					default_schedule[ 0 ] = 0;
+					default_schedule[ 1 ] = n;
+					default_schedule[ 2 ] = 1;
+					nRanges[ 0 ] = 1;
+					// set pointers to trivial schedule
+					data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) );
+					endPositions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) );
+					// verify trivial schedule
+#ifndef NDEBUG
+					{
+						NIT * const interpreted = reinterpret_cast< NIT * >(data[0]);
+						assert( interpreted[ 0 ] == 0 );
+						assert( interpreted[ 1 ] == n );
+					}
+					{
+						NIT * const interpreted = reinterpret_cast< NIT * >(endPositions[0]);
+						assert( *interpreted == 1 );
+					}
+#endif
+					// note that this is a trivial schedule
+					is_simple = true;
+
+					// the default schedule does not (cannot) sort input matrices
+					is_sorted = false;
+				}
+
+				/**
+				 * Implementation of move construction and assignment.
+				 */
+				void moveImpl( SptrsvSchedule &&toMove ) {
+					_deleters = std::move( toMove._deleters );
+					// raw arrays cannot be (move-)assigned; copy the three entries instead
+					default_schedule[ 0 ] = toMove.default_schedule[ 0 ];
+					default_schedule[ 1 ] = toMove.default_schedule[ 1 ];
+					default_schedule[ 2 ] = toMove.default_schedule[ 2 ];
+					is_sorted = toMove.is_sorted;
+					is_simple = toMove.is_simple;
+					supersteps = toMove.supersteps;
+					nThreads = toMove.nThreads;
+					data = std::move( toMove.data );
+					endPositions = std::move( toMove.endPositions );
+					nRanges = std::move( toMove.nRanges );
+					// if the moved-from instance used its trivial schedule, the moved
+					// pointers refer into toMove.default_schedule and would dangle;
+					// re-point them to our own copy of that schedule
+					if( data.size() > 0 && data[ 0 ] ==
+						reinterpret_cast< char * >( &(toMove.default_schedule[0]) )
+					) {
+						data[ 0 ] = reinterpret_cast< char * >( &(default_schedule[0]) );
+						endPositions[ 0 ] = reinterpret_cast< char * >( &(default_schedule[2]) );
+					}
+					toMove.is_sorted = true;
+					toMove.is_simple = false;
+					toMove.supersteps = 0;
+					toMove.nThreads = 0;
+				}
+
+
+			public:
+
+				/** Whether the matrix is sorted as part of tuning. */
+				bool is_sorted;
+
+				/** Whether the schedule is simple. */
+				bool is_simple;
+
+				/** Number of schedule steps. */
+				size_t supersteps;
+
+				/**
+				 * The number of threads the schedule is designed for.
+				 */
+				size_t nThreads;
+
+				/** One data pointer per thread. */
+				std::vector< char * > data;
+
+				/** One end-position array per thread. */
+				std::vector< char * > endPositions;
+
+				/** The total number of ranges per thread. */
+				std::vector< NIT > nRanges;
+
+				/** Move constructor. */
+				SptrsvSchedule( SptrsvSchedule &&toMove ) {
+					moveImpl( std::move( toMove ) );
+				}
+
+				/**
+				 * Base constructor.
+				 *
+				 * @param[in] n The size of the linear system.
+				 *
+				 * Initialises the schedule to the trivial always-valid schedule, which is
+				 * a single-thread single-superstep schedule that assigns all work to the
+				 * first (and only) thread.
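+				 *
+				 * The trivial schedule is laid out as follows (a sketch of the
+				 * invariants that #initTrivial establishes): \a data[ 0 ] points to the
+				 * pair \f$ (0, n) \f$, i.e., a single range covering all rows, while
+				 * \a endPositions[ 0 ] points to a single entry with value one, i.e.,
+				 * one range in the one and only superstep:
+				 * \code
+				 * SptrsvSchedule< size_t > schedule( n );
+				 * const size_t * const ranges =
+				 *     reinterpret_cast< size_t * >( schedule.data[ 0 ] );
+				 * assert( ranges[ 0 ] == 0 );  // lower bound of the only range
+				 * assert( ranges[ 1 ] == n );  // number of rows in that range
+				 * \endcode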
+				 */
+				SptrsvSchedule( const NIT n ) :
+					_deleters( 1 ), is_sorted( true ), is_simple( false ),
+					supersteps( 1 ), nThreads( 1 ),
+					data( 1 ), endPositions( 1 ), nRanges( 1 )
+				{
+					data[ 0 ] = nullptr;
+					endPositions[ 0 ] = nullptr;
+					initTrivial( n );
+				}
+
+				/**
+				 * Specialised constructor for multiple threads.
+				 *
+				 * @param[in] n The size of the linear system.
+				 * @param[in] T The number of threads a schedule should be created for.
+				 *
+				 * Initialises the schedule to the trivial always-valid schedule, which is
+				 * a single-superstep schedule that assigns all work to the first thread.
+				 */
+				SptrsvSchedule( const NIT n, const size_t T ) :
+					_deleters( 2 * T ), is_sorted( true ), is_simple( false ),
+					supersteps( 1 ), nThreads( T ),
+					data( T, nullptr ), endPositions( T, nullptr ), nRanges( T, 0 )
+				{
+					if( T == 0 ||
+						T > static_cast< size_t >( std::numeric_limits< int >::max() )
+					) {
+						throw std::runtime_error( "Invalid number of threads" );
+					}
+					initTrivial( n );
+				}
+
+				/**
+				 * Base destructor.
+				 *
+				 * Dynamic memory is freed through the #_deleters.
+				 */
+				~SptrsvSchedule() {
+					// sanity checks only
+					assert( nThreads != 0 );
+					if( nThreads == 1 ) {
+						assert( supersteps == 1 );
+					}
+				}
+
+				/** Move assignment. */
+				SptrsvSchedule& operator=( SptrsvSchedule &&toMove ) {
+					moveImpl( std::move( toMove ) );
+					return *this;
+				}
+
+				/**
+				 * Allocates a thread-local chunk of data.
+				 *
+				 * Must be called from within the thread that will use it(!)
+				 */
+				void alloc( const size_t s, const size_t nRanges_in ) {
+					assert( s < nThreads );
+					assert( supersteps > 0 );
+					assert( _deleters.size() > s );
+					assert( data.size() > s );
+					assert( data[ s ] == nullptr );
+					assert( endPositions[ s ] == nullptr );
+					if( nRanges[ s ] != nRanges_in ) {
+						throw std::runtime_error( "Given nRanges differs from the count "
+							"recorded at schedule allocation time" );
+					}
+					grb::RC rc = grb::SUCCESS;
+					if( nRanges_in > 0 ) {
+						rc = utils::alloc(
+							"grb::internal::SptrsvSchedule (alloc)",
+							"default thread-local data allocation, variant I",
+							data[ s ], 2 * nRanges_in * sizeof(NIT), false, _deleters[ s ],
+							endPositions[ s ], supersteps * sizeof( NIT ), false,
+							_deleters[ s + nThreads ]
+						);
+					} else {
+						data[ s ] = nullptr;
+						rc = utils::alloc(
+							"grb::internal::SptrsvSchedule (alloc)",
+							"default thread-local data allocation, variant II",
+							endPositions[ s ], supersteps * sizeof( NIT ), false,
+							_deleters[ s + nThreads ]
+						);
+					}
+					if( rc != grb::SUCCESS ) {
+						throw std::bad_alloc();
+					}
+				}
+
+		};
+
+	} // end namespace `grb::internal'
+
+} // end namespace `grb'
+
+#endif // _H_GRB_REFERENCE_SPTRSVSCHEDULE
+
diff --git a/include/graphblas/reference/blas2.hpp b/include/graphblas/reference/blas2.hpp
index 7a4d3e135..41fac2d0b 100644
--- a/include/graphblas/reference/blas2.hpp
+++ b/include/graphblas/reference/blas2.hpp
@@ -2216,6 +2216,658 @@ namespace grb {
 
 	namespace internal {
 
+#ifndef _H_GRB_REFERENCE_OMP_BLAS2
+		/**
+		 * Backend-independent sptrsv kernel code.
+		 *
+		 * @tparam sorted  0: the matrix rows are not sorted,
+		 *                 1: the matrix rows have diagonal elements at position 0,
+		 *                 2: the matrix rows are sorted.
+		 *
+		 * @tparam forward True: perform forward substitution.
+		 *                 False: perform backward substitution.
+		 *
+		 * \note The above description assumes CRS.
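+		 *
+		 * \par Illustration (CRS, forward substitution)
+		 * Consider a lower-triangular row \f$ i \f$ with column indices
+		 * \f$ \{ 0, 2, i \} \f$. With \a sorted = 0, all three entries are
+		 * scanned and the diagonal is detected on the fly; with \a sorted = 1,
+		 * the entry at position \a col_start[ i ] must be the diagonal and is
+		 * skipped; with \a sorted = 2, the row is sorted by column index, hence
+		 * the diagonal is the last entry of the row and the scan stops one
+		 * position early.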
+		 */
+		template<
+			int sorted, bool forward,
+			typename IOType, typename InputType1,
+			typename IND, typename NIT,
+			class Semiring, class Subtraction
+		>
+		inline void sptrsv_kernel(
+			const Compressed_Storage< InputType1, IND, NIT > &crs,
+			const size_t &i,
+			IOType *__restrict__ const &x,
+			IOType &divBy,
+			const Semiring &semiring,
+			const Subtraction &subtraction
+		) {
+			// static assertions
+			static_assert( sorted >= 0 && sorted < 3, "Illegal value for sorted; this "
+				"is an internal error, please submit a bug report" );
+			constexpr auto one =
+				Semiring::template One< typename Semiring::D1 >::value();
+
+			// NOTE: explicit vectorisation seems to cause significant slowdowns.
+			//       disabled but retained for future reference / more successful
+			//       optimisation
+			/*
+			// constants
+			constexpr size_t ind_blocksize =
+				grb::config::SIMD_BLOCKSIZE< IND >::value();
+			constexpr size_t blocksize = ind_blocksize < Semiring::blocksize
+				? ind_blocksize : Semiring::blocksize;
+			// constant buffer
+			IND cur_ind[ blocksize ];
+			// SIMD set
+			for( size_t r = 0; r < blocksize; ++r ) {
+				cur_ind[ r ] = i;
+			}
+			for( ; k + Semiring::blocksize < crs.col_start[ i + 1 ];
+				k += Semiring::blocksize
+			) {
+				typename Semiring::D1 values[ blocksize ];
+				typename Semiring::D3 tmp[ blocksize ];
+				IOType x_simd[ blocksize ];
+				IND indices[ blocksize ];
+				bool match[ blocksize ];
+				// streaming loads
+				for( size_t r = 0; r < blocksize; ++r ) {
+					values[ r ] = crs.template getValue( k + r, one );
+				}
+				for( size_t r = 0; r < blocksize; ++r ) {
+					indices[ r ] = crs.row_index[ k + r ];
+				}
+				// SIMD compute
+				for( size_t r = 0; r < blocksize; ++r ) {
+					match[ r ] = cur_ind[ r ] != indices[ r ];
+				}
+				// gather
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						x_simd[ r ] = x[ indices[ r ] ];
+					}
+				}
+				// masked SIMD compute
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						tmp[ r ] = values[ r ] * x_simd[ r ];
+					}
+				}
+				// masked SIMD reduce
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						reduced += tmp[ r ];
+					}
+				}
+				// diagonal detection; SIMD invert and reduce
+				for( size_t r = 0; r < blocksize; ++r ) {
+					match[ r ] = !match[ r ];
+				}
+				for( size_t r = 0; r < blocksize; ++r ) {
+					if( match[ r ] ) {
+						divBy += values[ r ];
+					}
+				}
+			}*/
+
+			if( sorted ) {
+				(void) divBy;
+			}
+			assert( crs.col_start[ i + 1 ] > crs.col_start[ i ] );
+			const size_t start = (!sorted) ? crs.col_start[ i ] : (
+				(sorted == 1) ? (crs.col_start[ i ] + 1) : (
+					forward ? crs.col_start[ i ] : (crs.col_start[ i ] + 1) )
+				);
+			const size_t end = (!sorted) ? crs.col_start[ i + 1 ] : (
+				(sorted == 1) ? crs.col_start[ i + 1 ] : (
+					forward ? (crs.col_start[ i + 1 ] - 1) : crs.col_start[ i + 1 ] )
+				);
+			for( size_t k = start; k < end; ++k ) {
+				const typename Semiring::D1 val = crs.template getValue( k, one );
+				const auto &ind = crs.row_index[ k ];
+				if( !sorted ) {
+					if( static_cast< size_t >(ind) == i ) {
+						#ifdef _DEBUG
+							std::cout << "\tskipping nonzero at " << i << ", " << i << "\n";
+						#endif
+						divBy = val;
+						continue;
+					}
+				}
+				#ifdef _DEBUG
+					std::cout << "\t x[ " << i << " ] (" << x[i] << ") -= " << val
+						<< " * x[ " << ind << " ] (" << x[ind] << ")\n";
+				#endif
+				typename Semiring::D3 tmp;
+				(void) grb::apply( tmp, val, x[ ind ],
+					semiring.getMultiplicativeOperator() );
+				(void) grb::foldl( x[ i ], tmp, subtraction );
+			}
+		}
+
+		template<
+			typename IOType, int sorted, bool forward,
+			typename VIT, typename RCIT, typename NIT
+		>
+		inline static IOType getDiagonalEntry(
+			const Compressed_Storage< VIT, RCIT, NIT > &storage, const size_t &i,
+			const IOType &one
+		) {
+			static_assert( sorted >= 1 && sorted < 3, "getDiagonalEntry may only be "
+				"called for sorted storage" );
+			assert( storage.col_start[ i + 1 ] > storage.col_start[ i ] );
+			const size_t diag_index = sorted == 1 ? storage.col_start[ i ] : (
+				forward ? (storage.col_start[ i + 1 ] - 1) : storage.col_start[ i ] );
+			assert( storage.row_index[ diag_index ] == i );
+			return storage.template getValue( diag_index, one );
+		}
+#endif // end ifndef _H_GRB_REFERENCE_OMP_BLAS2
+
+		/** \internal Specialised dense unmasked sptrsv implementation, sequential */
+		template<
+			Descriptor descr, bool maybe_offset, int sorted, bool forward,
+			class Semiring, class Subtraction, class Division,
+			typename IOPtrType, typename InputType1,
+			typename RIT, typename CIT, typename NIT
+		>
+		RC dense_unmasked_sequential_sptrsv(
+			const IOPtrType &v_raw,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const NIT &offset, const NIT &n,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			// static sanity checks
+			static_assert( sorted >= 0 && sorted < 3, "Invalid value for sorted; this "
+				"is an internal error, please submit a bug report" );
+
+			// only the execute phase reaches this point; resize is handled by callers
+			assert( phase == grb::EXECUTE );
+#ifdef NDEBUG
+			(void) phase;
+#endif
+
+			// get value type from the pointer
+			typedef typename std::remove_reference<
+				decltype( *std::declval< IOPtrType >() )
+			>::type IOType;
+
+			// in case of pattern matrices, get 1 from the semiring
+			const IOType one = semiring.getMultiplicativeMonoid().template
+				getIdentity< IOType >();
+
+			// switch forward or backward solve
+			if( forward ) {
+				const auto &crs = internal::getCRS( T );
+				if( !maybe_offset || offset == 0 ) {
+					for( size_t i = 0; i < n; ++i ) {
+						IOType divBy;
+						if( !sorted ) {
+							// in this case we will auto-detect the diagonal item
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, forward >( crs, i, one );
+						}
+						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
+						sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring,
+							subtraction );
+#ifdef _DEBUG
+						std::cout << "\trow v_raw[ " << i << " ] = " << v_raw[ i ]
+							<< " will be normalised with " << divBy << "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				} else {
+					const size_t end = n + offset;
+					for( size_t i = offset; i < end; ++i ) {
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, forward >( crs, i, one );
+						}
+						assert( crs.col_start[ i ] <= crs.col_start[ i + 1 ] );
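+						// subtract from v_raw[ i ] the contributions of all already-solved
+						// entries in row i; for unsorted storage (sorted == 0), the kernel
+						// additionally detects the diagonal entry and records it in divBy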
+						sptrsv_kernel< sorted, forward >( crs, i, v_raw, divBy, semiring,
+							subtraction );
+#ifdef _DEBUG
+						std::cout << "\trow v_raw[ " << i << " ] = " << v_raw[ i ]
+							<< " will be normalised with " << divBy << "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				}
+			} else {
+				const auto &ccs = internal::getCCS( T );
+				if( !maybe_offset || offset == 0 ) {
+					// note: this loop relies on unsigned wrap-around of i to terminate
+					// after processing i == 0
+					for( size_t i = n - 1; i < n; --i ) {
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, forward >( ccs, i, one );
+						}
+						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
+						sptrsv_kernel< sorted, forward >( ccs, i, v_raw, divBy, semiring,
+							subtraction );
+#ifdef _DEBUG
+						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+							<< "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				} else {
+					// note: offset > 0 in this branch, hence i >= offset terminates
+					for( size_t i = n - 1 + offset; i >= offset; --i ) {
+						IOType divBy;
+						if( !sorted ) {
+							divBy = semiring.template getZero< IOType >();
+						} else {
+							divBy = getDiagonalEntry< IOType, sorted, forward >( ccs, i, one );
+						}
+						assert( ccs.col_start[ i ] <= ccs.col_start[ i + 1 ] );
+						sptrsv_kernel< sorted, forward >( ccs, i, v_raw, divBy, semiring,
+							subtraction );
+#ifdef _DEBUG
+						std::cout << "\t" << v_raw[ i ] << " will be normalised with " << divBy
+							<< "\n";
+#endif
+						(void) grb::foldl( v_raw[ i ], divBy, division );
+					}
+				}
+			}
+
+			// done
+			return grb::SUCCESS;
+		}
+
+#ifdef _H_GRB_REFERENCE_OMP_BLAS2
+		/** \internal Specialised dense unmasked sptrsv implementation, OpenMP */
+		template<
+			Descriptor descr, bool forward,
+			class Semiring, class Subtraction, class Division,
+			typename IOType, typename InputType1,
+			typename RIT, typename CIT, typename NIT
+		>
+		RC dense_unmasked_omp_sptrsv(
+			IOType * const &v_raw,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const size_t &n,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			// in dense unmasked, resize is a no-op
+			if( phase == grb::RESIZE ) { return grb::SUCCESS; }
+
+			// get the SpTRSV schedule; fall back to the trivial schedule when no
+			// tuned schedule was attached to T
+			const SptrsvSchedule< NIT > *sptrsvSchedule_p =
+				internal::getSptrsvData( T );
+			const SptrsvSchedule< NIT > default_schedule( n );
+			const SptrsvSchedule< NIT > &sptrsv = sptrsvSchedule_p == nullptr ?
+				default_schedule : *sptrsvSchedule_p;
+
+			// only the execute phase remains at this point
+			assert( phase == grb::EXECUTE );
+
+			// start parallel section
+			grb::RC ret = grb::SUCCESS;
+
+			if( sptrsv.is_simple ) {
+				#pragma omp parallel num_threads(sptrsv.nThreads)
+				{
+					grb::RC local_rc = grb::SUCCESS;
+					const int s = omp_get_thread_num();
+					const NIT *__restrict__ data =
+						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
+					assert( data != nullptr );
+					if( sptrsv.is_sorted ) {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							const auto &lo = *data++;
+							const auto &no = *data++;
+							assert( lo < n );
+							assert( no + lo <= n );
+							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+									descr, true, config::tuning::SpTRSV::sortingMode, forward
+								>(
+									v_raw, T, lo, no, semiring, subtraction, division, phase
+								);
+							#pragma omp barrier
+						}
+					} else {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							const NIT &lo = *data++;
+							const NIT &no = *data++;
+							assert( lo < n );
+							assert( no + lo <= n );
+							local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+									descr, true, 0, forward
+								>(
+									v_raw, T, lo, no, semiring, subtraction, division, phase
+								);
+							#pragma omp barrier
+						}
+					}
+					if( local_rc != grb::SUCCESS ) {
+						#pragma omp atomic write
+						ret = local_rc;
+					}
+				}
+			} else {
+				#pragma omp parallel num_threads(sptrsv.nThreads)
+				{
+					grb::RC local_rc = grb::SUCCESS;
+					const int s = omp_get_thread_num();
+					const NIT *__restrict__ data =
+						reinterpret_cast< const NIT * >(sptrsv.data[ s ]);
+					const NIT *__restrict__ const end =
+						reinterpret_cast< const NIT * >(sptrsv.endPositions[ s ]);
+					assert( end != nullptr );
+					if( sptrsv.is_sorted ) {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							for( size_t k = 0; k < end[ i ]; ++k ) {
+								const NIT &lo = *data++;
+								const NIT &no = *data++;
+								assert( lo < n );
+								assert( no + lo <= n );
+								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+										descr, true, config::tuning::SpTRSV::sortingMode, forward
+									>(
+										v_raw, T, lo, no, semiring, subtraction, division, phase
+									);
+							}
+							#pragma omp barrier
+						}
+					} else {
+						for( size_t i = 0; i < sptrsv.supersteps; ++i ) {
+							for( size_t k = 0; k < end[ i ]; ++k ) {
+								const NIT &lo = *data++;
+								const NIT &no = *data++;
+								assert( lo < n );
+								assert( no + lo <= n );
+								local_rc = local_rc ? local_rc : dense_unmasked_sequential_sptrsv<
+										descr, true, 0, forward
+									>(
+										v_raw, T, lo, no, semiring, subtraction, division, phase
+									);
+							}
+							#pragma omp barrier
+						}
+					}
+					if( local_rc != grb::SUCCESS ) {
+						#pragma omp atomic write
+						ret = local_rc;
+					}
+				}
+			}
+
+			// done
+			return ret;
+		}
+#endif
+
+		/**
+		 * \internal Implements sparse masked, sparse unmasked, and dense masked
+		 * sptrsv.
+		 *
+		 * \note The boolean template arguments are ordered ( \a masked, \a sparse,
+		 *       \a forward ), matching all call sites of this function.
+		 */
+		template<
+			Descriptor descr, bool masked, bool sparse, bool forward,
+			class Semiring, class Subtraction, class Division,
+			typename IOType, typename InputType1, typename InputType2,
+			typename Coords, typename RIT, typename CIT, typename NIT
+		>
+		RC generic_sptrsv(
+			Vector< IOType, reference, Coords > &xb,
+			const Vector< InputType2, reference, Coords > &mask,
+			const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+			const size_t &n,
+			const Semiring &semiring,
+			const Subtraction &subtraction,
+			const Division &division,
+			const Phase &phase
+		) {
+			// static checks
+			static_assert( masked || sparse, "Internal logic error; please submit a bug "
+				"report" );
+
+			// dynamic checks
+			assert( grb::size( xb ) == n );
+			assert( !masked || grb::size( mask ) == n );
+			assert( grb::nrows( T ) == n );
+			assert( grb::ncols( T ) == n );
+
+			(void) descr;
+			(void) forward;
+			(void) masked;
+			(void) sparse;
+#ifdef NDEBUG
+			(void) xb;
+			(void) mask;
+			(void) T;
+			(void) n;
+#endif
+			(void) semiring;
+			(void) subtraction;
+			(void) division;
+			(void) phase;
+			std::cerr << "Warning: masked and/or sparse sptrsv is not yet implemented\n";
+			return grb::UNSUPPORTED;
+		}
+
+	} // end grb::internal
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring, class Subtraction, class Division,
+		typename IOType, typename InputType1,
+		typename Coords, typename RIT, typename CIT, typename NIT
+	>
+	RC sptrsv(
+		Vector< IOType, reference, Coords > &xb,
+		const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value,
+		void >::type * const = nullptr
+	) {
+		// check contract
+		constexpr bool dense = descr & descriptors::dense;
+		const size_t n = size( xb );
+		if( grb::nrows( T ) != n ) {
+			std::cerr << "Error, sptrsv (unmasked): matrix should have a number of rows "
+				<< "equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::ncols( T ) != n ) {
+			std::cerr << "Error, sptrsv (unmasked): matrix should have a number of "
+				<< "columns equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( xb ) < n ) {
+			std::cerr << "Error, sptrsv (unmasked): sparse vector given in conjunction "
+				<< "with a dense descriptor\n";
+			return grb::ILLEGAL;
+		}
+
+		// check trivial
+		if( n == 0 ) { return grb::SUCCESS; }
+
+		// in the dense variant, resize does nothing
+		if( phase == grb::RESIZE ) { return grb::SUCCESS; }
+
+		// check dense dispatch
+		if( dense || grb::nnz( xb ) == n ) {
+#ifndef _H_GRB_REFERENCE_OMP_BLAS2
+			IOType *__restrict__ const xb_p = internal::getRaw( xb );
+			const NIT lo = 0;
+			const NIT no = n;
+			if( forward ) {
+				return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, true >(
+					xb_p, T, lo, no, semiring, subtraction, division, phase );
+			} else {
+				return internal::dense_unmasked_sequential_sptrsv< descr, false, 0, false >(
+					xb_p, T, lo, no, semiring, subtraction, division, phase );
+			}
+#else
+			IOType * const xb_p = internal::getRaw( xb );
+			// The below code is only for testing (DBG):
+	#if 0
+			const auto sptrsv = internal::getSptrsvData( T );
+			#pragma omp parallel num_threads(sptrsv->nThreads)
+			{
+				const auto &crs = internal::getCRS( T );
+				const int *__restrict__ schedule = reinterpret_cast< int * >(
+					sptrsv->data[ omp_get_thread_num() ] );
+				for( size_t step = 0; step < sptrsv->supersteps; ++step ) {
+					const int &lo = *schedule++;
+					const int &no = *schedule++;
+					const int upper_limit = lo + no;
+					for( int row_idx = lo; row_idx < upper_limit; ++row_idx ) {
+						// if not sorted, enable this variant instead:
+						// IOType div = 0;
+						// for( unsigned int k = crs.col_start[ row_idx ];
+						//	k < crs.col_start[ row_idx + 1 ]; ++k ) {
+						for( unsigned int k = crs.col_start[ row_idx ];
+							k < crs.col_start[ row_idx + 1 ] - 1; ++k
+						) {
+							const int &j = crs.row_index[ k ];
+							const double &value = crs.values[ k ];
+							// if not sorted, enable this if-else:
+							// if( j == row_idx ) {
+							//	div = value;
+							// } else {
+							xb_p[ row_idx ] -= value * xb_p[ j ];
+							//}
+						}
+						#pragma omp critical
+						xb_p[ row_idx ] /= crs.values[ crs.col_start[ row_idx + 1 ] - 1 ];
+					}
+					#pragma omp barrier
+				}
+			}
+			return grb::SUCCESS;
+	#else
+			if( forward ) {
+				return internal::dense_unmasked_omp_sptrsv< descr, true >(
+					xb_p, T, n, semiring, subtraction, division, phase );
+			} else {
+				return internal::dense_unmasked_omp_sptrsv< descr, false >(
+					xb_p, T, n, semiring, subtraction, division, phase );
+			}
+	#endif
+#endif
+		} else {
+			grb::Vector< IOType, reference, Coords > no_mask( 0 );
+			if( forward ) {
+				return internal::generic_sptrsv< descr, false, true, true >(
+					xb, no_mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			} else {
+				return internal::generic_sptrsv< descr, false, true, false >(
+					xb, no_mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			}
+		}
+	}
+
+	template<
+		Descriptor descr = descriptors::no_operation,
+		class Semiring,
+		class Subtraction,
+		class Division,
+		typename IOType, typename InputType1, typename InputType2,
+		typename Coords, typename RIT, typename CIT, typename NIT
+	>
+	RC sptrsv(
+		Vector< IOType, reference, Coords > &xb,
+		const Vector< InputType2, reference, Coords > &mask,
+		const Matrix< InputType1, reference, RIT, CIT, NIT > &T,
+		const bool forward,
+		const Semiring &semiring = Semiring(),
+		const Subtraction &subtraction = Subtraction(),
+		const Division &division = Division(),
+		const Phase &phase = EXECUTE,
+		const typename std::enable_if<
+			grb::is_semiring< Semiring >::value &&
+			grb::is_operator< Subtraction >::value &&
+			grb::is_operator< Division >::value &&
+			!grb::is_object< IOType >::value &&
+			!grb::is_object< InputType1 >::value &&
+			!grb::is_object< InputType2 >::value,
+		void >::type * const = nullptr
+	) {
+		// check whether we can forward to the unmasked variant
+		if( grb::size( mask ) == 0 ) {
+			return sptrsv< descr >( xb, T, forward, semiring, subtraction, division,
+				phase );
+		}
+
+		// check contract
+		constexpr bool dense = descr & descriptors::dense;
+		const size_t n = size( xb );
+		if( grb::size( mask ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): mask and input/output vector "
+				<< "should have the same size\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::nrows( T ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): matrix should have a number of "
+				<< "rows equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( grb::ncols( T ) != n ) {
+			std::cerr << "Error, grb::sptrsv (masked): matrix should have a number of "
+				<< "columns equal to the size of the input/output vector\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( mask ) < n ) {
+			std::cerr << "Error, grb::sptrsv (masked): sparse mask given in "
+				<< "conjunction with a dense descriptor\n";
+			return grb::ILLEGAL;
+		}
+		if( dense && grb::nnz( xb ) < n ) {
+			std::cerr << "Error, grb::sptrsv (masked): sparse input vector given in "
+				<< "conjunction with a dense descriptor\n";
+			return grb::ILLEGAL;
+		}
+
+		if( dense || (grb::nnz( xb ) == n && grb::nnz( mask ) == n) ) {
+			if( forward ) {
+				return internal::generic_sptrsv< descr, true, false, true >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			} else {
+				return internal::generic_sptrsv< descr, true, false, false >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase
+				);
+			}
+		} else {
+			if( forward ) {
+				return internal::generic_sptrsv< descr, true, true, true >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase );
+			} else {
+				return internal::generic_sptrsv< descr, true, true, false >(
+					xb, mask, T, n,
+					semiring, subtraction, division, phase );
+			}
+		}
+	}
+
+	namespace internal {
+
 #ifndef _H_GRB_REFERENCE_OMP_BLAS2
 	/**
 	 * A nonzero wrapper for use with grb::eWiseLambda over matrices.
diff --git a/include/graphblas/reference/config.hpp b/include/graphblas/reference/config.hpp
index 82e067ffd..116ed0202 100644
--- a/include/graphblas/reference/config.hpp
+++ b/include/graphblas/reference/config.hpp
@@ -333,7 +333,34 @@ namespace grb {
 
 		/** @} */
 
-	} // namespace config
+
+		/**
+		 * Collects all optimisations with regard to tuning operations for specific
+		 * inputs, i.e., #grb::PHASE::TUNE.
+		 */
+		namespace tuning {
+
+			/**
+			 * Collects all options for SpTRSV tuning.
+			 */
+			struct SpTRSV {
+
+				/**
+				 * There are three sorting modes implemented:
+				 *  - 0: the matrix rows are not sorted;
+				 *  - 1: the matrix rows have diagonal elements at position 0;
+				 *  - 2: the matrix rows are sorted.
+				 * The above description assumes CRS; the chosen sorting mode is,
+				 * however, also applied to the CCS (in which case the above three
+				 * points apply with `rows' substituted by `columns').
+				 */
+				static constexpr int sortingMode = 0;
+
+			};
+
+		} // namespace grb::config::tuning
+
+	} // namespace grb::config
 
 } // namespace grb
 
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index c1b3a365d..1b99614ef 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -49,6 +49,7 @@
 #include
 #include
+#include <graphblas/reference/SptrsvSchedule.hpp>
 #include
 
 #include "NonzeroWrapper.hpp"
@@ -237,6 +238,259 @@ namespace grb {
 			return A.valbuf[ 1 ];
 		}
 
+		/**
+		 * \internal
+		 * Retrieves an optimised sptrsv-specific schedule.
+		 * \endinternal
+		 */
+		template< typename InputType, typename RIT, typename CIT, typename NIT >
+		const internal::SptrsvSchedule< NIT > * getSptrsvData(
+			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
+		) noexcept {
+			return A.sptrsvSchedule;
+		}
+
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT,
+			typename NumRangesIt
+		>
+		void allocateSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const NIT n, const size_t nThreads, const size_t nSteps,
+			const NumRangesIt nRanges, const NumRangesIt nRanges_end
+		) {
+			if( A.sptrsvSchedule != nullptr ) {
+				throw std::runtime_error( "SptrsvSchedule was already initialised" );
+			}
+			A.sptrsvSchedule = new SptrsvSchedule< NIT >( n, nThreads );
+
+			auto &sptrsv = *(A.sptrsvSchedule);
+
+			sptrsv.supersteps = nSteps;
+			{
+				std::vector< NIT > tmp( nRanges, nRanges_end );
+				sptrsv.nRanges = std::move( tmp );
+			}
+
+			// disable default trivial schedule
+			sptrsv.data[ 0 ] = nullptr;
+			sptrsv.endPositions[ 0 ] = nullptr;
+
+			// spawn exactly one thread per requested schedule entry
+			#pragma omp parallel num_threads( nThreads )
+			{
+				auto localIt = nRanges;
+				const size_t s = omp_get_thread_num();
+				const size_t actualNumThreads = omp_get_num_threads();
+				if( actualNumThreads < nThreads ) {
+					throw std::runtime_error( "Unexpected number of threads" );
+				}
+				std::advance( localIt, s );
+				if( sptrsv.data[ s ] || sptrsv.endPositions[ s ] ) {
+					throw std::runtime_error( "A thread-local schedule already existed" );
+				}
+				sptrsv.alloc( s, *localIt );
+			}
+		}
+
+		/**
+		 * This schedule ingestion method expects the schedule as an iterator pair
+		 * over a container of doubly-nested vectors. Each vector \a v that is
+		 * iterated over corresponds to one superstep. Within a superstep,
+		 * \a v[ s ][ k ] relates to thread \a s: entry \a k describes the
+		 * \f$ \lfloor k/2 \rfloor \f$-th range to be processed by that thread
+		 * during that superstep. Even-numbered \a k hold the lower bounds of the
+		 * ranges, while odd-numbered \a k hold their (inclusive) upper bounds.
+		 *
+		 * \note This is not an ideal way to pass around schedules, as it assumes
+		 *       the container element types are doubly-nested STL vectors; it is
+		 *       hence not a generic approach.
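+		 *
+		 * \par Example
+		 * A sketch of the expected input for two threads and two supersteps; all
+		 * numbers below are hypothetical, and this assumes
+		 * #allocateSptrsvSchedule was called with \a nSteps = 2 and per-thread
+		 * range counts 3 and 1:
+		 * \code
+		 * // superstep 0: thread 0 solves rows [0,4], thread 1 solves rows [5,9];
+		 * // superstep 1: thread 0 solves rows [10,12] and [15,17], thread 1 idles
+		 * std::vector< std::vector< std::vector< size_t > > > bounds = {
+		 *     { { 0, 4 }, { 5, 9 } },
+		 *     { { 10, 12, 15, 17 }, { } }
+		 * };
+		 * internal::setSptrsvSchedule( A, bounds.cbegin(), bounds.cend(), 2 );
+		 * \endcode
+		 * Upper bounds are inclusive, matching the \f$ h = v[ s ][ k ] + 1 \f$
+		 * parsing below.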
+		 */
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT, typename It
+		>
+		void setSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const It &bounds_begin, const It &bounds_end,
+			const size_t nThreads
+		) {
+			if( A.sptrsvSchedule == nullptr ) {
+				throw std::runtime_error( "SptrsvSchedule was not initialised" );
+			}
+			auto &sptrsv = *(A.sptrsvSchedule);
+			if( sptrsv.nThreads != nThreads ) {
+				throw std::runtime_error( "SptrsvSchedule was allocated with different "
+					"nThreads" );
+			}
+
+			// re-sort A, if enabled
+			// note: both sorting modes assume non-pattern matrices whose value and
+			// index types are losslessly convertible to double resp. int
+			sptrsv.is_sorted = true;
+			if( config::tuning::SpTRSV::sortingMode == 2 ) {
+				// CRS first
+				{
+					const auto &crs = internal::getCRS( A );
+					const size_t nThreads = A.m < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+								config::OMP::threads(),
+								A.nz / config::CACHE_LINE_SIZE::value()
+							);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.m; ++i ) {
+						std::vector< std::pair< int, double > > pairs;
+						for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k ) {
+							pairs.push_back( std::make_pair( crs.row_index[ k ], crs.values[ k ] ) );
+						}
+						std::sort( pairs.begin(), pairs.end(),
+							[]( const std::pair< int, double > &left,
+								const std::pair< int, double > &right
+							) {
+								return left.first < right.first;
+							} );
+						auto it = pairs.cbegin();
+						for( size_t k = crs.col_start[ i ]; k < crs.col_start[ i + 1 ]; ++k, ++it ) {
+							assert( it != pairs.cend() );
+							crs.row_index[ k ] = it->first;
+							crs.values[ k ] = it->second;
+						}
+					}
+				}
+				// then CCS
+				{
+					const auto &ccs = internal::getCCS( A );
+					const size_t nThreads = A.n < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+								config::OMP::threads(),
+								A.nz / config::CACHE_LINE_SIZE::value()
+							);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.n; ++i ) {
+						std::vector< std::pair< int, double > > pairs;
+						for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k ) {
+							pairs.push_back( std::make_pair( ccs.row_index[ k ], ccs.values[ k ] ) );
+						}
+						std::sort( pairs.begin(), pairs.end(),
+							[]( const std::pair< int, double > &left,
+								const std::pair< int, double > &right
+							) {
+								return left.first < right.first;
+							} );
+						auto it = pairs.cbegin();
+						for( size_t k = ccs.col_start[ i ]; k < ccs.col_start[ i + 1 ]; ++k, ++it ) {
+							assert( it != pairs.cend() );
+							ccs.row_index[ k ] = it->first;
+							ccs.values[ k ] = it->second;
+						}
+					}
+				}
+			} else if( config::tuning::SpTRSV::sortingMode == 1 ) {
+				{
+					const auto &crs = internal::getCRS( A );
+					const size_t nThreads = A.m < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+								config::OMP::threads(),
+								A.nz / config::CACHE_LINE_SIZE::value()
+							);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.m; ++i ) {
+						const auto index_it = std::find(
+							crs.row_index + crs.col_start[ i ],
+							crs.row_index + crs.col_start[ i + 1 ],
+							i
+						);
+						// mirror the CCS branch below: a missing diagonal is an error
+						if( index_it == crs.row_index + crs.col_start[ i + 1 ] ) {
+							throw std::runtime_error( "No diagonal element present" );
+						}
+						if( !std::is_void< InputType >::value ) {
+							const size_t index = std::distance( crs.row_index, index_it );
+							std::swap( crs.values[ index ], crs.values[ crs.col_start[ i ] ] );
+						}
+						std::swap( *index_it, crs.row_index[ crs.col_start[ i ] ] );
+					}
+				}
+				{
+					const auto &ccs = internal::getCCS( A );
+					const size_t nThreads = A.n < config::OMP::minLoopSize()
+						? 1
+						: std::min(
+								config::OMP::threads(),
+								A.nz / config::CACHE_LINE_SIZE::value()
+							);
+					#pragma omp parallel for num_threads( nThreads )
+					for( size_t i = 0; i < A.n; ++i ) {
+						const auto start = ccs.row_index + ccs.col_start[ i ];
+						const auto end = ccs.row_index + ccs.col_start[ i + 1 ];
+						const auto index_it = std::find( start, end, i );
+						if( index_it == end ) {
+							throw std::runtime_error( "No diagonal element present" );
+						} else if( index_it != start ) {
+							if( !std::is_void< InputType >::value ) {
+								const size_t index = std::distance( ccs.row_index, index_it );
+								std::swap( ccs.values[ index ], ccs.values[ ccs.col_start[ i ] ] );
+							}
+							std::swap( *index_it, ccs.row_index[ ccs.col_start[ i ] ] );
+						}
+					}
+				}
+			} else {
+				sptrsv.is_sorted = false;
+			}
+
+			sptrsv.is_simple = true;
+			#pragma omp parallel num_threads( nThreads )
+			{
+				bool simple_local = true;
+				It bounds = bounds_begin;
+				const size_t actualNumThreads = omp_get_num_threads();
+				if( actualNumThreads != nThreads ) {
+					throw std::runtime_error( "Unexpected number of threads" );
+				}
+				const size_t s = omp_get_thread_num();
+				// get buffer as an array of NIT, which we will write to in one pass
+				NIT *__restrict__ array = reinterpret_cast< NIT * >(sptrsv.data[ s ]);
+				// get the number of ranges that this function should populate, for
+				// each superstep at this thread
+				NIT *__restrict__ end = reinterpret_cast< NIT * >(sptrsv.endPositions[ s ]);
+				// dynamic assertion: there should be at least one superstep
+				assert( bounds != bounds_end );
+				// start ingestion
+				size_t count = 0;
+				do {
+					assert( bounds->size() == nThreads );
+					size_t nRanges = 0;
+					// get this superstep's and this thread's vector v[s]
+					const auto &v = (*bounds)[s];
+					// go range-by-range
+					for( auto it = v.cbegin(); it != v.cend(); ++it ) {
+						// we have a range, parse it
+						const NIT l = *it++;
+						assert( it != v.cend() );
+						const NIT h = *it + 1;
+						assert( h > l );
+						const NIT n = h - l;
+						if( count >= sptrsv.supersteps ) {
+							throw std::runtime_error( "Too many supersteps" );
+						}
+						// store it
+						assert( nRanges < sptrsv.nRanges[ s ] );
+						*array++ = l;
+						*array++ = n;
+						(void) ++nRanges;
+					}
+					// store the number of ranges in the end array
+					assert( count < sptrsv.supersteps );
+					if( nRanges != 1 ) {
+						simple_local = false;
+					}
+					*end++ = nRanges;
+					// forward to the next superstep
+					(void) ++count;
+					(void) ++bounds;
+				} while( bounds != bounds_end );
+				if( count != sptrsv.supersteps ) {
+					throw std::runtime_error( "Unexpected number of supersteps" );
+				}
+				if( !simple_local ) {
+					#pragma omp critical
+					sptrsv.is_simple = false;
+				}
+			}
+		}
+
 		template<
 			Descriptor descr, bool input_dense, bool output_dense,
@@ -1225,6 +1479,30 @@ namespace grb {
 			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
 		) noexcept;
 
+		template< typename InputType, typename RIT, typename CIT, typename NIT >
+		friend const internal::SptrsvSchedule< NIT > * internal::getSptrsvData(
+			const grb::Matrix< InputType, reference, RIT, CIT, NIT > &A
+		) noexcept;
+
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT,
+			typename NumRangesIt
+		>
+		friend void internal::allocateSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const NIT n, const size_t nThreads, const size_t nSteps,
+			const NumRangesIt nRanges, const NumRangesIt nRanges_end
+		);
+
+		template<
+			typename InputType, typename RIT, typename CIT, typename NIT, typename It
+		>
+		friend void internal::setSptrsvSchedule(
+			grb::Matrix< InputType, reference, RIT, CIT, NIT > &A,
+			const It &bounds,
+			const It &bounds_end,
+			const size_t nThreads
+		);
+
 		friend const grb::Matrix< D, reference,
 			ColIndexType, ColIndexType, NonzeroIndexType
@@ -1351,6 +1629,11 @@ namespace grb {
 		 */
 		utils::AutoDeleter< char > _local_deleter[ 6 ];
 
+		/**
+		 * Optimised schedule for sptrsv operations on this matrix.
+		 */
+		internal::SptrsvSchedule< NonzeroIndexType > * sptrsvSchedule;
+
 		/**
 		 * Internal constructor for manual construction of matrices.
 		 *
@@ -1403,7 +1686,7 @@ namespace grb {
 			id( std::numeric_limits< uintptr_t >::max() ), remove_id( false ),
 			m( _m ), n( _n ), cap( _cap ), nz( _offset_array[ _m ] ),
 			coorArr{ nullptr, buf1 }, coorBuf{ nullptr, buf2 },
-			valbuf{ nullptr, buf3 }
+			valbuf{ nullptr, buf3 }, sptrsvSchedule( nullptr )
 		{
 			assert( (_m > 0 && _n > 0) || _column_indices[ 0 ] == 0 );
 			CRS.replace( _values, _column_indices );
@@ -1424,6 +1707,8 @@ namespace grb {
 			const size_t rows, const size_t cols, const size_t cap_in
 		) {
+			// the sptrsv schedule must always be set manually, via an explicitly
+			// requested tuning phase
+			sptrsvSchedule = nullptr;
 #ifdef _DEBUG_REFERENCE_MATRIX
 			std::cerr << "\t in Matrix< reference >::initialize...\n"
 				<< "\t\t matrix size " << rows << " by " << cols << "\n"
@@ -1596,6 +1881,7 @@ namespace grb {
 				_deleter[ i ] = std::move( other._deleter[ i ] );
 				_local_deleter[ i ] = std::move( other._local_deleter[ i ] );
 			}
+			sptrsvSchedule = other.sptrsvSchedule;
 
 			// invalidate other fields
 			for( unsigned int i = 0; i < 2; ++i ) {
@@ -1608,6 +1894,7 @@ namespace grb {
 			other.n = 0;
 			other.cap = 0;
 			other.nz = 0;
+			other.sptrsvSchedule = nullptr;
 		}
 
 		/**
@@ -2228,6 +2515,9 @@ namespace grb {
 #endif
 				internal::reference_mapper.remove( id );
 			}
+			if( sptrsvSchedule ) {
+				delete sptrsvSchedule;
+			}
 		}
 
 		/**
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 5cd03cf77..049af30e9 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -374,6 +374,12 @@ add_grb_executables( launch_benchmark_frommpi_manual launcherAndBenchmarker.cpp
 	COMPILE_DEFINITIONS NO_LPF_AUTO_INIT
 )
 
+# tests for experimental features
+
+add_grb_executables( dense_sptrsv dense_sptrsv.cpp
+	BACKENDS reference reference_omp
+)
+
 # targets to list and build the test for this category
 get_property( unit_tests_list GLOBAL PROPERTY tests_category_unit )
 add_custom_target( "list_tests_category_unit"
diff --git a/tests/unit/dense_sptrsv.cpp b/tests/unit/dense_sptrsv.cpp
new file mode 100644
index 000000000..c7138fe94
--- /dev/null
+++ b/tests/unit/dense_sptrsv.cpp
@@ -0,0 +1,320 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <set>
+
+#include <inttypes.h>
+
+#include "graphblas/algorithms/matrix_factory.hpp"
+#include "graphblas/utils/timer.hpp"
+
+#include "graphblas.hpp"
+
+
+using namespace grb;
+
+struct input {
+	size_t n;
+	size_t rep;
+};
+
+struct output {
+	enum grb::RC error_code;
+	grb::utils::TimerResults times;
+};
+
+template< typename T, class Semiring >
+bool checkVectorDenseAndEqualTo(
+	const grb::Vector< T > &x, const T val, struct output &out,
+	const Semiring &semiring
+) {
+	size_t count = 0;
+	bool error = false;
+	std::set< size_t > localSet;
+	const size_t n = grb::size( x );
+	for( const auto &pair : x ) {
+		(void) ++count;
+		localSet.insert( pair.first );
+		if( pair.second != val ) {
+			std::cout << "unexpected entry ( " << pair.first << ", " << pair.second
+				<< " ); expected value " << val << "\n";
+			error = true;
+		}
+	}
+	if( grb::collectives<>::allreduce( count, semiring.getAdditiveOperator() )
+		!= grb::SUCCESS
+	) {
+		std::cerr << "error during computing checksum (I)\n";
+		out.error_code = grb::PANIC;
+		return false;
+	}
+	if( count != n ) {
+		std::cout << "expected " << n << " nonzeroes, got " << count << "\n";
+		error = true;
+	}
+	count = localSet.size();
+	if( grb::collectives<>::allreduce( count, semiring.getAdditiveOperator() )
+		!= grb::SUCCESS
+	) {
+		std::cerr << "error during computing checksum (II)\n";
+		out.error_code = grb::PANIC;
+		return false;
+	}
+	if( count != n ) {
+		std::cout << "expected " << n << " indices, got " << count << "\n";
+		error = true;
+	}
+	if( out.error_code == grb::SUCCESS ) {
+		if( error ) {
+			out.error_code = grb::FAILED;
+			return false;
+		}
+	}
+	return true;
+}
+
+// main test program
+void grbProgram( const struct input &data_in, struct output &out ) {
+	out.error_code = grb::PANIC;
+	grb::utils::Timer timer;
+
+	const size_t s = grb::spmd<>::pid();
+#ifndef NDEBUG
+	assert( s < grb::spmd<>::nprocs() );
+#else
+	(void) s;
+#endif
+
+	// get input n
+	const size_t n = data_in.n;
+
+	// setup
+	grb::Vector< double > x( n );
+	grb::Matrix< double > A = grb::algorithms::matrices< double >::identity( n );
+	Semiring<
+		grb::operators::add< double >, grb::operators::mul< double >,
+		grb::identities::zero, grb::identities::one
+	> semiring;
+	grb::operators::divide< double > division;
+	grb::operators::subtract< double > subtraction;
+
+	out.error_code = grb::set( x, 3.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) {
+		std::cerr << "NOTE: the aforementioned errors occurred during test "
+			<< "initialisation\n";
+		return;
+	}
+	out.times.preamble = timer.time();
+
+	if( s == 0 ) {
+		std::cout << "Test 1: ";
+	}
+	timer.reset();
+	out.error_code = grb::sptrsv( x, A, false, semiring, subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv( x, A, false, semiring, subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 2: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 2.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv( x, A, true, semiring, subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv( x, A, true, semiring, subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 3: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 3.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv< grb::descriptors::dense >( x, A, false, semiring,
+		subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 3.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv< grb::descriptors::dense >( x, A, false, semiring,
+			subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	assert( out.error_code == grb::SUCCESS );
+	if( s == 0 ) {
+		std::cout << "\b\b 4: ";
+	}
+	timer.reset();
+	out.error_code = grb::set< grb::descriptors::dense >( x, 2.14 );
+	if( out.error_code ) {
+		std::cerr << "Error during test initialisation\n";
+		return;
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) {
+		std::cerr << "NOTE: aforementioned errors occurred during initialisation\n";
+		return;
+	}
+	out.times.preamble += timer.time();
+	timer.reset();
+	out.error_code = grb::sptrsv< grb::descriptors::dense >( x, A, true, semiring,
+		subtraction, division );
+	if( out.error_code ) {
+		std::cerr << "test returned error\n";
+	}
+	if( !checkVectorDenseAndEqualTo( x, 2.14, out, semiring ) ) { return; }
+	out.times.postamble += timer.time();
+	timer.reset();
+	for( size_t rep = 0; rep < data_in.rep; ++rep ) {
+		(void) grb::sptrsv< grb::descriptors::dense >( x, A, true, semiring,
+			subtraction, division );
+	}
+	out.times.useful += timer.time();
+
+	std::cout << "OK\n";
+}
+
+// main function will execute in serial or as SPMD
+int main( int argc, char ** argv ) {
+	// sanity check: n is mandatory, inner and outer repetitions are optional
+	if( argc < 2 || argc > 4 ) {
+		std::cout << "Usage: " << argv[ 0 ]
+			<< " <n> (inner repetitions) (outer repetitions)" << std::endl;
+		return 0;
+	}
+	std::cout << "Test executable: " << argv[ 0 ] << std::endl;
+
+	// the input struct
+	struct input in;
+	in.n = atoi( argv[ 1 ] );
+	in.rep = grb::config::BENCHMARKING::inner();
+	size_t outer = grb::config::BENCHMARKING::outer();
+	char * end = NULL;
+	if( argc >= 3 ) {
+		in.rep = strtoumax( argv[ 2 ], &end, 10 );
+		if( argv[ 2 ] == end ) {
+			std::cerr << "Could not parse argument for number of inner "
+				<< "repetitions."
+				<< std::endl;
+			return 25;
+		}
+	}
+	if( argc >= 4 ) {
+		outer = strtoumax( argv[ 3 ], &end, 10 );
+		if( argv[ 3 ] == end ) {
+			std::cerr << "Could not parse argument for number of outer "
+				<< "repetitions." << std::endl;
+			return 30;
+		}
+	}
+
+	std::cout << "Executable called with parameters: problem size " << in.n
+		<< ", inner = " << in.rep << ", outer = " << outer << "." << std::endl;
+
+	// the output struct
+	struct output out;
+
+	// run the program one time to infer the number of inner repetitions
+	if( in.rep == 0 ) {
+		in.rep = 1;
+		grb::Launcher< AUTOMATIC > launcher;
+		const enum grb::RC rc = launcher.exec( &grbProgram, in, out, true );
+		if( rc != SUCCESS ) {
+			std::cerr << "launcher.exec returns with non-SUCCESS error code "
+				<< grb::toString( rc ) << std::endl;
+			return 40;
+		}
+		// set a guesstimate for the number of inner repetitions: a single
+		// experiment should take at least one second
+		in.rep = static_cast< double >( 1000.0 / out.times.useful ) + 1;
+		std::cout << "Auto-selected number of inner repetitions is "
+			<< in.rep << " (at an estimated time of "
+			<< out.times.useful << " ms. of useful work per benchmark).\n";
+	}
+
+	// start benchmarks
+	grb::Benchmarker< AUTOMATIC > benchmarker;
+	const enum grb::RC rc = benchmarker.exec( &grbProgram, in, out, 1, outer, true );
+	if( rc != SUCCESS ) {
+		std::cerr << "benchmarker.exec returns with non-SUCCESS error code "
+			<< grb::toString( rc ) << std::endl;
+		return 50;
+	}
+
+	// done
+	if( out.error_code != SUCCESS ) {
+		std::cout << "Test FAILED\n" << std::endl;
+		std::cerr << std::flush;
+		return out.error_code;
+	}
+	std::cout << "Test OK\n" << std::endl;
+	return 0;
+}
+
diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh
index 0b49d2421..e6d3f7644 100755
--- a/tests/unit/unittests.sh
+++ b/tests/unit/unittests.sh
@@ -197,6 +197,19 @@ for MODE in ${MODES}; do
 			grep 'Test OK' ${TEST_OUT_DIR}/mxm_crs_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED"
 			echo " "
 
+			echo ">>> [x] [ ]   Testing grb::sptrsv (unmasked, dense vectors) on"
+			echo "              small simple matrices, re-entrant"
+			$runner ${TEST_BIN_DIR}/dense_sptrsv_${MODE}_${BACKEND} 10 1000 3 &> ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log
+			head -1 ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log
+			grep 'Test OK' ${TEST_OUT_DIR}/dense_sptrsv_${MODE}_${BACKEND}_${T}.log || echo "Test FAILED"
+			echo " "
+
+			echo ">>> [x] [ ]   Testing grb::sptrsv (unmasked, dense vectors) on"
+			echo "              large simple matrices, re-entrant"
+			$runner ${TEST_BIN_DIR}/dense_sptrsv_${MODE}_${BACKEND} 1000000 1 3 &> ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log
+			head -1 ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log
+			grep 'Test OK' ${TEST_OUT_DIR}/dense_sptrsv_large_${MODE}_${BACKEND}_${T}.log || echo "Test FAILED"
+			echo " "
 		fi
 
 		echo "#################################################################"