diff --git a/include/graphblas/base/io.hpp b/include/graphblas/base/io.hpp
index 493dfcfea..c904570fc 100644
--- a/include/graphblas/base/io.hpp
+++ b/include/graphblas/base/io.hpp
@@ -1218,15 +1218,17 @@ namespace grb {
 * -# #grb::descriptors::invert_mask, and
 * -# #grb::descriptors::structural.
 *
- * However, and differently from most ALP primtivies, the
+ * However, and differently from most ALP/GraphBLAS primitives, the
 * #grb::descriptors::invert_mask and #grb::descriptors::structural are
 * mutually exclusive for this primitive.
 * \endparblock
 *
 * @tparam OutputType The type of each element in the given matrix.
 * @tparam MaskType The type of each element in the given mask.
- * @tparam ValueType The type of the given value. Should be convertible
- * to \a OutputType.
+ * @tparam ValueType The type of the given value.
+ *
+ * The given \a ValueType must be convertible to \a OutputType.
+ *
 * @tparam RIT The integer type for encoding row indices.
 * @tparam CIT The integer type for encoding column indices.
 * @tparam NIT The integer type for encoding nonzero indices.
@@ -1244,51 +1246,50 @@ namespace grb {
 * the default is #grb::EXECUTE.
 *
 * In #grb::RESIZE mode:
- * @returns #grb::SUCCESS When the capacity of \a C has been (made) sufficient
- * to store the requested output.
+ *
+ * @returns #grb::SUCCESS When the capacity of \a C has been made, or already
+ * was, sufficient to store the requested output.
 * @returns #grb::OUTOFMEM When out-of-memory conditions have been met while
- * executing. If this error code is returned, \a C
+ * resizing \a C. If this error code is returned, \a C
 * shall be unmodified compared to its state at
 * function entry.
 *
 * In #grb::EXECUTE mode:
- * @returns #grb::SUCCESS When the call completes successfully.
+ *
+ * @returns #grb::SUCCESS When the computation has completed or will execute
+ * successfully.
 * @returns #grb::ILLEGAL When \a C did not have enough capacity to store the
 * output of the requested computation.
 *
 * Either mode may additionally return:
- * @returns #grb::ILLEGAL In case the given \a mask was empty.
+ *
 * @returns #grb::MISMATCH In case \a C and \a mask have mismatching sizes.
 * @returns #grb::PANIC In case an unmitigable error was encountered. The
- * caller is suggested to exit gracefully, and in any
- * case to not make any further calls to ALP.
+ * caller, when encountering this return code, is
+ * suggested to exit gracefully and to not make any
+ * further calls to ALP.
 *
 * When \a descr includes #grb::descriptors::no_casting then code shall not
- * compile if one of the following conditions are met:
+ * compile if one of the following conditions is met:
 * -# \a ValueType does not match \a OutputType; or
 * -# \a MaskType does not match bool.
 *
- * In these cases, the code shall not compile: implementations must throw
- * a static assertion failure in this case.
- *
 * Similarly, it is forbidden to call this function with both following
 * descriptors simultaneously:
 * - #grb::descriptors::invert_mask \em and #grb::descriptors::structural.
 *
 * The use of the #grb::descriptors::structural_complement descriptor hence is
- * is forbidden also. Implementations shall throw a static assertion failure
- * if the user nonetheless asks for structural mask inversion.
+ * forbidden also. These conditions, when encountered, should lead to
+ * compile-time errors.
+ *
+ * \note One way to ensure these cases do not compile is via a
+ * static_assert.
 *
 * \parblock
 * \par Performance semantics
 * Each backend must define performance semantics for this primitive.
 *
 * @see perfSemantics
- *
- * \warning Generally, if \a mask equals \a C and the mask is non-structural,
- * then optimised implementations will assign higher costs than when
- * \a mask does not equal \a C. This is because the nonzero structure
- * update cannot be done in-place.
 * \endparblock
 */
 template<
@@ -1320,6 +1321,141 @@ namespace grb {
 return UNSUPPORTED;
 }
 
+ /**
+ * Sets all values of a matrix to those of a given source matrix, if and only if
+ * the corresponding value coordinates evaluate true at the given mask
+ * matrix.
+ *
+ * @tparam descr The descriptor used for this operation.
+ *
+ * \parblock
+ * \par Accepted descriptors
+ * -# #grb::descriptors::no_operation,
+ * -# #grb::descriptors::no_casting,
+ * -# #grb::descriptors::invert_mask, and
+ * -# #grb::descriptors::structural.
+ *
+ * However, and differently from most ALP/GraphBLAS primitives, the
+ * #grb::descriptors::invert_mask and #grb::descriptors::structural are
+ * mutually exclusive for this primitive.
+ * \endparblock
+ *
+ * @tparam OutputType The type of each element in the destination matrix.
+ * @tparam MaskType The type of each element in the output mask.
+ * @tparam ValueType The type of each element in the source matrix.
+ *
+ * The given \a ValueType must be convertible to \a OutputType.
+ *
+ * \internal
+ * @tparam RIT1 The integer type for encoding row indices in \a C.
+ * @tparam CIT1 The integer type for encoding column indices in \a C.
+ * @tparam NIT1 The integer type for encoding nonzero indices in \a C.
+ * @tparam RIT2 The integer type for encoding row indices in \a mask.
+ * @tparam CIT2 The integer type for encoding column indices in \a mask.
+ * @tparam NIT2 The integer type for encoding nonzero indices in \a mask.
+ * @tparam RIT3 The integer type for encoding row indices in \a A.
+ * @tparam CIT3 The integer type for encoding column indices in \a A.
+ * @tparam NIT3 The integer type for encoding nonzero indices in \a A.
+ * @tparam backend The backend selected for executing this primitive.
+ * \endinternal
+ *
+ * @param[out] C The matrix that will be a masked copy of \a A.
+ * @param[in] mask Matrix that acts as output mask on \a C.
+ * @param[in] A The source matrix which will (partially) be copied to
+ * \a C.
+ * @param[in] phase Which #grb::Phase of the operation is requested. Optional;
+ * the default is #grb::EXECUTE.
+ *
+ * In #grb::RESIZE mode:
+ *
+ * @returns #grb::SUCCESS When the capacity of \a C has been made, or already
+ * was, sufficient to store the requested output.
+ * @returns #grb::OUTOFMEM When out-of-memory conditions were met while
+ * resizing \a C. If this error code is returned, \a C
+ * shall be left unmodified compared to its state at
+ * function entry.
+ *
+ * In #grb::EXECUTE mode:
+ *
+ * @returns #grb::SUCCESS When the computation has completed or will execute
+ * successfully.
+ * @returns #grb::ILLEGAL When \a C did not have enough capacity to store the
+ * output of the requested computation.
+ *
+ * Either mode may additionally return:
+ *
+ * @returns #grb::MISMATCH When \a A and \a C have mismatching sizes.
+ * @returns #grb::MISMATCH When \a C and \a mask have mismatching sizes.
+ * @returns #grb::PANIC In case an unmitigable error was encountered. The
+ * caller, when encountering this return code, is
+ * suggested to exit the program gracefully and to not
+ * make any further calls to ALP.
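+ *
+ * \par Example
+ * A minimal sketch of intended use (the container names and dimensions are
+ * hypothetical; a Boolean mask is assumed):
+ * \code
+ * grb::Matrix< double > C( m, n ), A( m, n );
+ * grb::Matrix< bool > mask( m, n );
+ * // copy those values of A into C where mask evaluates true
+ * grb::RC rc = grb::set( C, mask, A, grb::RESIZE );
+ * if( rc == grb::SUCCESS ) {
+ *     rc = grb::set( C, mask, A, grb::EXECUTE );
+ * }
+ * \endcode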
+ *
+ * When \a descr includes #grb::descriptors::no_casting, then code shall not
+ * compile if one of the following conditions is met:
+ * -# \a ValueType does not match \a OutputType; or
+ * -# \a MaskType does not match bool.
+ *
+ * Similarly, it is forbidden to call this function with both following
+ * descriptors simultaneously:
+ * - #grb::descriptors::invert_mask \em and #grb::descriptors::structural.
+ *
+ * The use of the #grb::descriptors::structural_complement descriptor hence
+ * is forbidden also. These conditions should lead to compile-time errors.
+ *
+ * \note One way to ensure these cases do not compile is via a
+ * static_assert.
+ *
+ * \parblock
+ * \par Performance semantics
+ * Each backend must define performance semantics for this primitive.
+ *
+ * @see perfSemantics
+ * \endparblock
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename MaskType, typename ValueType,
+ typename RIT1, typename CIT1, typename NIT1,
+ typename RIT2, typename CIT2, typename NIT2,
+ typename RIT3, typename CIT3, typename NIT3,
+ Backend backend
+ >
+ RC set(
+ Matrix< OutputType, backend, RIT1, CIT1, NIT1 > &C,
+ const Matrix< MaskType, backend, RIT2, CIT2, NIT2 > &mask,
+ const Matrix< ValueType, backend, RIT3, CIT3, NIT3 > &A,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< OutputType >::value &&
+ !grb::is_object< ValueType >::value &&
+ !grb::is_object< MaskType >::value,
+ void >::type * const = nullptr
+ ) noexcept {
+#ifndef NDEBUG
+ const bool should_not_call_base_matrix_masked_matrix_set = false;
+ assert( should_not_call_base_matrix_masked_matrix_set );
+#endif
+ (void) C;
+ (void) mask;
+ (void) A;
+ (void) phase;
+ return UNSUPPORTED;
+ }
+
 /**
 * Sets the element of a given vector at a given position to a given value.
 *
@@ -1763,7 +1886,15 @@ namespace grb {
 * successfully, then a call to this function shall return #grb::SUCCESS.
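+ *
+ * \par Example
+ * A minimal sketch, assuming a hypothetical vector <tt>y</tt> that preceding
+ * nonblocking primitives write to:
+ * \code
+ * const grb::RC rc = grb::wait( y ); // forces execution of all queued
+ *                                    // primitives that affect y
+ * \endcode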
 *
- * The are several other cases in which the computation of nonblocking
- * primtives is forced:
+ * There are several other cases in which the computation of nonblocking
+ * primitives is forced:
 * -# whenever an output iterator of an output container of any of the non-
 * blocking primitives is requested; and
 * -# whenever an output container of any of the non-blocking primitives is
diff --git a/include/graphblas/bsp1d/init.hpp b/include/graphblas/bsp1d/init.hpp
index 582479bf1..d5798d894 100644
--- a/include/graphblas/bsp1d/init.hpp
+++ b/include/graphblas/bsp1d/init.hpp
@@ -34,6 +34,7 @@
 #include
 #include
+#include <cstdint> // uintptr_t
 #include <cassert> // assertions
 #include
diff --git a/include/graphblas/bsp1d/io.hpp b/include/graphblas/bsp1d/io.hpp
index edada4191..b8181041a 100644
--- a/include/graphblas/bsp1d/io.hpp
+++ b/include/graphblas/bsp1d/io.hpp
@@ -32,7 +32,6 @@
 #include
 #endif
 
-#include "graphblas/blas1.hpp" // for grb::size
 #include "graphblas/nonzeroStorage.hpp"
 
 // the below transforms an std::vector iterator into an ALP/GraphBLAS-compatible
@@ -976,20 +975,166 @@ namespace grb {
 #ifdef _BSP1D_IO_DEBUG
 std::cout << "Called grb::set( matrix, mask, value ) (BSP1D)\n";
 #endif
+ // catch trivial cases
 const size_t m = nrows( C );
 const size_t n = ncols( C );
+ if( m == 0 || n == 0 ) { return SUCCESS; }
 
- // dynamic checks (I)
+ // dynamic checks
+ if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
+ return ILLEGAL;
+ }
 if( m != nrows( mask ) || n != ncols( mask ) ) {
 return MISMATCH;
 }
 
- // catch trivial case
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t delegating to final backend\n";
+#endif
+ RC ret = SUCCESS;
+ // Take care that local matrices may be empty, even if the global matrix is
+ // not. Processes with empty local matrices will not delegate (no-op).
+ {
+ auto &local_C = internal::getLocal( C );
+ const auto &local_mask = internal::getLocal( mask );
+ const size_t local_m = nrows( local_C );
+ const size_t local_n = ncols( local_C );
+ assert( local_m == nrows( local_mask ) );
+ assert( local_n == ncols( local_mask ) );
+ if( local_m > 0 && local_n > 0 ) {
+ ret = set< descr >( local_C, local_mask, val, phase );
+ }
+ }
+
+ // in the structural self-masked case, there is no way an error could occur
+ if( (descr & descriptors::structural) && getID( C ) == getID( mask ) ) {
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t structural self-masking detected, which allows trivial "
+ "exit\n"; // since neither the nnz nor the capacity would change
+#endif
+ assert( ret == SUCCESS );
+ return ret;
+ }
+
+ // in all other cases, in either mode (resize or execute), we must check for
+ // errors
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t all-reducing error code\n";
+#endif
+ if( collectives< BSP1D >::allreduce( ret, operators::any_or< RC >() )
+ != SUCCESS
+ ) {
+ return PANIC;
+ }
+
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t all-reduced error code is " << toString( ret ) << "\n";
+#endif
+ if( phase == RESIZE ) {
+ if( ret == SUCCESS ) {
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t resize phase detected -- synchronising capacity\n";
+#endif
+ ret = internal::updateCap( C );
+ if( ret != SUCCESS ) {
+ std::cerr << "Error updating capacity: " << toString( ret ) << "\n";
+ }
+ }
+ } else {
+ assert( phase == EXECUTE );
+ if( ret == SUCCESS ) {
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t execute phase detected -- synchronising nnz count\n";
+#endif
+ ret = internal::updateNnz( C );
+ if( ret != SUCCESS ) {
+ std::cerr << "Error updating output number of nonzeroes: "
+ << toString( ret ) << "\n";
+ }
+ } else if( ret == ILLEGAL ) {
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t delegate returns ILLEGAL, clearing output\n";
+#endif
+ const RC clear_rc = clear( C );
+ if( clear_rc != SUCCESS ) {
+ ret = PANIC;
+ }
+ } else {
+ if( ret != PANIC ) {
+ std::cerr << "Warning: unexpected error code in grb::set( matrix, mask, "
+ << "value ) (BSP1D). Please submit a bug report.\n";
+ }
+ assert( ret == PANIC );
+ }
+ }
+
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "\t done; returning " << toString( ret ) << "\n";
+#endif
+
+ // done
+ return ret;
+ }
+
+ /**
+ * The implementation can largely rely on the final (process-local) backend;
+ * however, the capacity and nonzero count of the global output must then be
+ * synchronised across processes. The below implementation mostly deals with
+ * that logic.
+ */
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename DataType, typename RIT1, typename CIT1, typename NIT1,
+ typename MaskType, typename RIT2, typename CIT2, typename NIT2,
+ typename ValueType = DataType, typename RIT3, typename CIT3, typename NIT3
+ >
+ RC set(
+ Matrix< DataType, BSP1D, RIT1, CIT1, NIT1 > &C,
+ const Matrix< MaskType, BSP1D, RIT2, CIT2, NIT2 > &mask,
+ const Matrix< ValueType, BSP1D, RIT3, CIT3, NIT3 > &A,
+ const Phase &phase = EXECUTE,
+ const typename std::enable_if<
+ !grb::is_object< DataType >::value &&
+ !grb::is_object< ValueType >::value &&
+ !grb::is_object< MaskType >::value
+ >::type * const = nullptr
+ ) noexcept {
+ // static checks
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< ValueType, DataType >::value
+ ), "grb::set( matrix, mask, matrix ) (BSP1D)",
+ "called with non-matching value types"
+ );
+ NO_CAST_ASSERT(
+ ( !(descr & descriptors::no_casting) ||
+ std::is_same< MaskType, bool >::value ),
+ "grb::set( matrix, mask, matrix ) (BSP1D)",
+ "called with non-Boolean mask value type"
+ );
+ static_assert( !( (descr & descriptors::structural) &&
+ (descr & descriptors::invert_mask)
+ ), "grb::set( matrix, mask, matrix ) (BSP1D): Primitives with matrix "
+ "outputs may not employ structurally inverted masking"
+ );
+#ifdef _BSP1D_IO_DEBUG
+ std::cout << "Called grb::set( matrix, mask, matrix ) (BSP1D)\n";
+#endif
+ const size_t m = nrows( C );
+ const size_t n = ncols( C );
+
+ // dynamic checks (I)
+ if( m != nrows( A ) || n != ncols( A ) ) {
+ return MISMATCH;
+ }
+
+ // catch trivial cases
+ if( m == 0 || n == 0 ) { return SUCCESS; }
+ if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
+ return set< descr >( C, A, phase );
+ }
 
 // dynamic checks (II)
- if( nrows( mask ) == 0 || ncols( mask ) == 0 ) {
- return ILLEGAL;
+ if( m != nrows( mask ) || n != ncols( mask ) ) {
+ return MISMATCH;
 }
 
 #ifdef _BSP1D_IO_DEBUG
@@ -1000,13 +1145,16 @@ namespace grb {
 // not. Processes with empty local matrices will not delegate (no-op).
 {
 auto &local_C = internal::getLocal( C );
+ const auto &local_A = internal::getLocal( A );
 const auto &local_mask = internal::getLocal( mask );
 const size_t local_m = nrows( local_C );
 const size_t local_n = ncols( local_C );
 assert( local_m == nrows( local_mask ) );
 assert( local_n == ncols( local_mask ) );
+ assert( local_m == nrows( local_A ) );
+ assert( local_n == ncols( local_A ) );
 if( local_m > 0 && local_n > 0 ) {
- ret = set< descr >( local_C, local_mask, val, phase );
+ ret = set< descr >( local_C, local_mask, local_A, phase );
 }
 }
 
@@ -1625,8 +1773,9 @@ namespace grb {
 // a pipeline depth of 2 is sufficient.
 constexpr size_t iteration_overlaps = 2;
- const std::unique_ptr< size_t > first_nnz_per_thread(
- new size_t[ num_threads * iteration_overlaps ]()
+ const std::unique_ptr< size_t, void(*)(size_t * const) > first_nnz_per_thread(
+ new size_t[ num_threads * iteration_overlaps ](),
+ [](size_t * const array){ delete [] array; }
 );
 size_t * const first_nnz_per_thread_ptr = first_nnz_per_thread.get();
 outgoing.resize( data.P );
diff --git a/include/graphblas/descriptors.hpp b/include/graphblas/descriptors.hpp
index cd0a5ab25..2c0a793ae 100644
--- a/include/graphblas/descriptors.hpp
+++ b/include/graphblas/descriptors.hpp
@@ -62,7 +62,20 @@ namespace grb {
 */
 static constexpr Descriptor no_operation = 0;
 
- /** Inverts the mask prior to applying it. */
+ /**
+ * Inverts the mask prior to applying it.
+ *
+ * Applying this descriptor to a sparse mask may still only generate output
+ * where the mask has elements. To decide whether an output will be generated
+ * at a given element in the mask, its value will be inverted.
+ *
+ * If instead the structural complement is to be taken as a mask, this
+ * descriptor must be combined with #grb::descriptors::structural.
+ * ALP/GraphBLAS forbids taking the structural inverse of matrix masks
+ * (because then either the output matrix or the mask matrix has
+ * \f$ mn \f$ values, which defeats any useful application of
+ * GraphBLAS as this signifies one of the containers is, in fact, not sparse).
+ */
 static constexpr Descriptor invert_mask = 1;
 
 /**
@@ -98,6 +111,11 @@ namespace grb {
 * i-th index, regardless of how that value evaluates. It evaluates false
 * if there was no value assigned.
 *
+ * These semantics are inverted when this descriptor is combined with
+ * #grb::descriptors::invert_mask: in that case, the mask evaluates true at
+ * index \f$ i \f$ if the mask had no value at that index; and evaluates false
+ * otherwise.
+ *
 * @see structural_complement
 */
 static constexpr Descriptor structural = 8;
@@ -113,6 +131,10 @@ namespace grb {
 * This ignores the actual values of the mask argument. The i-th element of
 * the mask now evaluates true if the mask has \em no value assigned to its
 * i-th index, and evaluates false otherwise.
+ *
+ * The application of this descriptor is forbidden for matrix mask arguments,
+ * as otherwise either the output or the mask is dense, or (too) close to
+ * being dense.
 */
 static constexpr Descriptor structural_complement = structural | invert_mask;
 
diff --git a/include/graphblas/hyperdags/hyperdags.hpp b/include/graphblas/hyperdags/hyperdags.hpp
index 2ae951565..b4a650c0e 100644
--- a/include/graphblas/hyperdags/hyperdags.hpp
+++ b/include/graphblas/hyperdags/hyperdags.hpp
@@ -381,6 +381,8 @@ namespace grb {
 
 SET_MATRIX_MATRIX_INPUT2,
 
+ SET_MATRIX_MATRIX_MASKED,
+
 MXM_MATRIX_MATRIX_MATRIX_SEMIRING,
 
 MXM_MATRIX_MATRIX_MATRIX_MONOID,
 
@@ -503,7 +505,7 @@ namespace grb {
 };
 
 /** \internal How many operation vertex types exist. */
- const constexpr size_t numOperationVertexTypes = 111;
+ const constexpr size_t numOperationVertexTypes = 112;
 
 /** \internal An array of all operation vertex types.
*/ const constexpr enum OperationVertexType diff --git a/include/graphblas/hyperdags/io.hpp b/include/graphblas/hyperdags/io.hpp index 4ec81917b..99711f0bb 100644 --- a/include/graphblas/hyperdags/io.hpp +++ b/include/graphblas/hyperdags/io.hpp @@ -28,6 +28,10 @@ #include +#ifdef _DEBUG + #define _DEBUG_HYPERDAGS_IO +#endif + namespace grb { @@ -440,6 +444,110 @@ namespace grb { return ret; } + /** + * This function inherits the performance semantics of the underlying backend. + */ + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, typename MaskType, typename InputType, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC set( + Matrix< OutputType, hyperdags, RIT1, CIT1, NIT1 > &C, + const Matrix< MaskType, hyperdags, RIT2, CIT2, NIT2 > &M, + const Matrix< InputType, hyperdags, RIT3, CIT3, NIT3 > &A, + const Phase &phase = EXECUTE + ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cout << "Entering set(matrix, matrix, matrix), hyperdags backend\n"; +#endif + // first, check for dynamic errors + const size_t m = nrows( C ); + const size_t n = ncols( C ); + if( m != nrows( A ) || n != ncols( A ) ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (I)\n"; +#endif + return MISMATCH; + } + if( m != nrows( M ) && nrows( M ) != 0 ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (II)\n"; +#endif + return MISMATCH; + } + if( n != ncols( M ) && ncols( M ) != 0 ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ): dimension mismatch (III)\n"; +#endif + return MISMATCH; + } + // second, check for trivial op + if( m == 0 || n == 0 ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t WARNING set( matrix, matrix, matrix ), hyperdags: " + << "trivial op detected (all containers empty). 
No operation will be " + << "recorded\n"; +#endif + return SUCCESS; + } + // third, execute +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: forwarding to " + << "execution backend\n"; +#endif + const RC ret = set< descr >( + internal::getMatrix( C ), internal::getMatrix( M ), + internal::getMatrix( A ), phase + ); + // fourth, forward any errors + if( ret != SUCCESS ) { return ret; } + if( phase != EXECUTE ) { return ret; } + // fifth, record operation +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: execution had no " + << "error; recording operation\n"; +#endif + std::array< const void *, 0 > sourcesP{}; + std::array< uintptr_t, 1 > destinations{ getID( internal::getMatrix(C) ) }; + if( nrows( M ) == 0 || ncols( M ) == 0 ) { +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t WARNING set( matrix, matrix, matrix ), hyperdags: " + << "empty mask detected; hyperDAG will not (cannot) record it\n"; +#endif + std::array< uintptr_t, 2 > sourcesC{ + getID( internal::getMatrix(A) ), + getID( internal::getMatrix(C) ) + }; + internal::hyperdags::generator.addOperation( + internal::hyperdags::SET_MATRIX_MATRIX_MASKED, + sourcesP.begin(), sourcesP.end(), + sourcesC.begin(), sourcesC.end(), + destinations.begin(), destinations.end() + ); + } else { + std::array< uintptr_t, 3 > sourcesC{ + getID( internal::getMatrix(A) ), + getID( internal::getMatrix(M) ), + getID( internal::getMatrix(C) ) + }; + internal::hyperdags::generator.addOperation( + internal::hyperdags::SET_MATRIX_MATRIX_MASKED, + sourcesP.begin(), sourcesP.end(), + sourcesC.begin(), sourcesC.end(), + destinations.begin(), destinations.end() + ); + } + + // done +#ifdef _DEBUG_HYPERDAGS_IO + std::cerr << "\t set( matrix, matrix, matrix ), hyperdags: exiting\n"; +#endif + return ret; + } + /** * This function inherits the performance semantics of the underlying backend. 
*/ diff --git a/include/graphblas/nonblocking/io.hpp b/include/graphblas/nonblocking/io.hpp index 0831fc564..963846251 100644 --- a/include/graphblas/nonblocking/io.hpp +++ b/include/graphblas/nonblocking/io.hpp @@ -1227,6 +1227,42 @@ namespace grb { internal::getRefMatrix( C ), internal::getRefMatrix( A ), val, phase ); } + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, typename MaskType, typename InputType, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC set( + Matrix< OutputType, nonblocking, RIT1, CIT1, NIT1 > &C, + const Matrix< MaskType, nonblocking, RIT2, CIT2, NIT2 > &M, + const Matrix< InputType, nonblocking, RIT3, CIT3, NIT3 > &A, + const Phase &phase = EXECUTE + ) noexcept { + if( internal::NONBLOCKING::warn_if_not_native && + config::PIPELINE::warn_if_not_native + ) { + std::cerr << "Warning: masked set (nonblocking) currently delegates " + << "to a blocking implementation.\n" + << " Further similar such warnings will be suppressed.\n"; + internal::NONBLOCKING::warn_if_not_native = false; + } + + // nonblocking execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); + + // second, delegate to the reference backend + return set< descr >( + internal::getRefMatrix( C ), + internal::getRefMatrix( M ), + internal::getRefMatrix( A ), + phase + ); + } + template< Descriptor descr = descriptors::no_operation, typename InputType, diff --git a/include/graphblas/phase.hpp b/include/graphblas/phase.hpp index 1f12b8f05..044395ac9 100644 --- a/include/graphblas/phase.hpp +++ b/include/graphblas/phase.hpp @@ -236,7 +236,7 @@ namespace grb { * execute phase. * * If, instead, the output container capacity was found to be insufficient, - * then the requested operation may return #grb::FAILED, in which case the + * then the requested operation may return #grb::ILLEGAL, in which case the * contents of output containers shall be cleared. * * \note That on failure a primitive called using the execute phase may diff --git a/include/graphblas/reference/blas3.hpp b/include/graphblas/reference/blas3.hpp index 054d945dd..3cffb588a 100644 --- a/include/graphblas/reference/blas3.hpp +++ b/include/graphblas/reference/blas3.hpp @@ -579,432 +579,6 @@ namespace grb { return SUCCESS; } -#ifndef _H_GRB_REFERENCE_OMP_BLAS3 - /** - * Meta-data for global buffer management for use with #grb::mxm. - * - * This meta-data is the same for both the sequential (reference) and shared- - * memory parallel (reference_omp) backends. - * - * This class contains all meta-data necessary to interpret the global buffer - * as an array of sparse accumulators (SPAs). The length of the array is given - * by a call to #threads(), minus one. It is called that since a call to - * #threads() retrieves how many threads can be used to process the call to - * #grb::mxm. - * - * Each SPA has the layout (bitarray, stack, valueArray). These are packed in a - * padded byte array, such that each bit array, stack, and value array is - * aligned on sizeof(int) bytes. - * - * @tparam NIT The nonzero index type. - * @tparam ValueType The output matrix value type. - */ - template< typename NIT, typename ValueType > - class MXM_BufferMetaData { - - static_assert( sizeof(NIT) % sizeof(int) == 0, "Unsupported type for NIT; " - "please submit a bug report!" 
); - - private: - - /** The size of the offset array */ - size_t m; - - /** The size of the SPA */ - size_t n; - - /** The number of threads supported during a call to #grb::mxm */ - size_t nthreads; - - /** The initial buffer offset */ - size_t bufferOffset; - - /** The size of a single SPA, including bytes needed for padding */ - size_t paddedSPASize; - - /** The number of bytes to pad the SPA array with */ - size_t arrayShift; - - /** The number of bytes to pad the SPA stack with */ - size_t stackShift; - - /** - * Given a number of used bytes of the buffer, calculate the available - * remainder buffer and return it. - * - * @param[in] osize The size of the buffer (in bytes) that is already - * in use. - * @param[out] remainder Pointer to any remainder buffer. - * @param[out] rsize The size of the remainder buffer. - * - * If no buffer space is left, \a remainder will be set to nullptr - * and \a size to 0. - */ - void retrieveRemainderBuffer( - const size_t osize, - void * &remainder, size_t &rsize - ) const noexcept { - const size_t size = internal::template getCurrentBufferSize< char >(); - char * rem = internal::template getReferenceBuffer< char >( size ); - size_t rsize_calc = size - osize; - rem += osize; - const size_t mod = reinterpret_cast< uintptr_t >(rem) % sizeof(int); - if( mod ) { - const size_t shift = sizeof(int) - mod; - if( rsize_calc >= shift ) { - rsize_calc -= shift; - rem += rsize; - } else { - rsize_calc = 0; - rem = nullptr; - } - } - assert( !(reinterpret_cast< uintptr_t >(rem) % sizeof(int)) ); - // write out - remainder = rem; - rsize = rsize_calc; - } - - - public: - - /** - * Base constructor. - * - * @param[in] _m The length of the offset array. - * @param[in] _n The length of the SPA. - * @param[in] max_threads The maximum number of threads. - * - * \note \a max_threads is a separate input since there might be a need to - * cap the maximum number of threads used based on some analytic - * performance model. Rather than putting such a performance model - * within this class, we make it an obligatory input parameter - * instead. - * - * \note It is always valid to pass config::OMP::threads(). - * - * \note This class \em will, however, cap the number of threads returned - * to \a _n. 
- */ - MXM_BufferMetaData( - const size_t _m, const size_t _n, - const size_t max_threads - ) : m( _m ), n( _n ), arrayShift( 0 ), stackShift( 0 ) { - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - std::cout << "\t\t\t computing padded buffer size for a SPA of length " - << n << " while leaving space for an additional offset buffer of length " - << std::max( m, n ) << "...\n"; - #endif - // compute bufferOffset - bufferOffset = (std::max( m, n ) + 1) * sizeof( NIT ); - - // compute value buffer size - const size_t valBufSize = n * sizeof( ValueType ); - - #ifdef _DEBUG_REFERENCE_BLAS3 - std::cout << "\t\t\t\t bit-array size has byte-size " << - internal::Coordinates< reference >::arraySize( n ) << "\n"; - std::cout << "\t\t\t\t stack has byte-size " << - internal::Coordinates< reference >::stackSize( n ) << "\n"; - std::cout << "\t\t\t\t value buffer has byte-size " << valBufSize << "\n"; - #endif - - // compute paddedSPASize - paddedSPASize = - internal::Coordinates< reference >::arraySize( n ) + - internal::Coordinates< reference >::stackSize( n ) + - valBufSize; - size_t shift = - internal::Coordinates< reference >::arraySize( n ) % sizeof(int); - if( shift != 0 ) { - arrayShift = sizeof(int) - shift; - paddedSPASize += arrayShift; - } - shift = internal::Coordinates< reference >::stackSize( n ) % sizeof(int); - if( shift != 0 ) { - stackShift = sizeof(int) - shift; - paddedSPASize += stackShift; - } - shift = valBufSize % sizeof(int); - if( shift != 0 ) { - paddedSPASize += (sizeof(int) - shift); - } - - // pad bufferOffset - shift = bufferOffset % sizeof(int); - if( shift != 0 ) { - bufferOffset += (sizeof(int) - shift); - } - - // compute free buffer size - const size_t freeBufferSize = internal::getCurrentBufferSize< char >() - - bufferOffset; - - // compute max number of threads - nthreads = 1 + freeBufferSize / paddedSPASize; - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - std::cout << "\t\t\t free buffer size: " << freeBufferSize - << ", (padded) SPA size: " << paddedSPASize - << " -> supported #threads: " << nthreads << ". " - << " The shifts for the bit-array and the stack are " << arrayShift - << ", respectively, " << stackShift << "." - << "\n"; - #endif - // cap the final number of selected threads - if( nthreads > max_threads ) { - nthreads = max_threads; - } - if( nthreads > n ) { - nthreads = n; - } - } - - /** @returns The maximum number of supported threads during #grb::mxm */ - size_t threads() const noexcept { - return nthreads; - } - - /** - * Requests and returns a global buffer required for a thread-local SPA. - * - * @param[in] t The thread ID. Must be larger than 0. - * - * \note Thread 0 employs the SPA allocated with the output matrix. - * - * @returns Pointer into the global buffer starting at the area reserved for - * the SPA of thread \a t. - */ - char * getSPABuffers( size_t t ) const noexcept { - assert( t > 0 ); - (void) --t; - char * raw = internal::template getReferenceBuffer< char >( - bufferOffset + nthreads * paddedSPASize ); - assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 ); - raw += bufferOffset; - assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 ); - raw += t * paddedSPASize; - return raw; - } - - /** - * Retrieves the column offset buffer. - * - * @param[out] remainder Returns any remainder buffer beyond that of the row - * offset buffer. - * @param[out] rsize The remainder buffer size \a remainder points to. - * - * If \a remainder is not a nullptr then neither should \a rsize, - * and vice versa. 
- * - * Retrieving any remainder buffer is optional. The default is to not ask - * for them. - * - * \warning If all buffer memory is used for the column offsets, it may be - * that \a remainder equals nullptr and rsize - * zero. - * - * \warning This buffer is only guaranteed exclusive if only the retrieved - * column buffer is used. In particular, if also requesting (and - * using) SPA buffers, the remainder buffer area is shared with - * those SPA buffers, and data races are likely to occur. In other - * words: be very careful with any use of these remainder buffers. - * - * @returns The column offset buffer. - * - * \warning This buffer overlaps with the CRS offset buffer. The caller - * must ensure to only ever use one at a time. - */ - NIT * getColOffsetBuffer( - void * * const remainder = nullptr, - size_t * const rsize = nullptr - ) const noexcept { - NIT * const ret = internal::template getReferenceBuffer< NIT >( n + 1 ); - if( remainder != nullptr || rsize != nullptr ) { - assert( remainder != nullptr && rsize != nullptr ); - retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize ); - } - return ret; - } - - /** - * Retrieves the row offset buffer. - * - * @param[out] remainder Returns any remainder buffer beyond that of the row - * offset buffer. - * @param[out] rsize The remainder buffer size \a remainder points to. - * - * If \a remainder is not a nullptr then neither should \a rsize, - * and vice versa. - * - * Retrieving any remainder buffer is optional. The default is to not ask - * for them. - * - * \warning If all buffer memory is used for the row offsets, it may be that - * \a remainder equals nullptr and rsize zero. - * - * \warning This buffer is only guaranteed exclusive if only the retrieved - * row buffer is used. In particular, if also requesting (and - * using) SPA buffers, the remainder buffer area is shared with - * those SPA buffers, and data races are likely to occur. In other - * words: be very careful with any use of these remainder buffers. - * - * @returns The row offset buffer. - * - * \warning This buffer overlaps with the CCS offset buffer. The caller - * must ensure to only ever use one at a time. - */ - NIT * getRowOffsetBuffer( - void * * const remainder = nullptr, - size_t * const rsize = nullptr - ) const noexcept { - NIT * const ret = internal::template getReferenceBuffer< NIT >( m + 1 ); - if( remainder != nullptr || rsize != nullptr ) { - assert( remainder != nullptr && rsize != nullptr ); - retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize ); - } - return ret; - } - - /** - * Shifts a pointer into the global buffer by the bit-array size and its - * padding. - * - * @param[in,out] raw On input: an aligned pointer into the global buffer. - * On output: an aligned pointer past the bit-array - * position. - */ - void applyArrayShift( char * &raw ) const noexcept { - const size_t totalShift = - internal::Coordinates< reference >::arraySize( n ) + - arrayShift; - #ifdef _DEBUG_REFERENCE_BLAS3 - std::cout << "\t\t\t shifting input pointer with " - << internal::Coordinates< reference >::arraySize( n ) << " + " - << arrayShift << " = " << totalShift << "bytes \n"; - #endif - raw += totalShift; - } - - /** - * Shifts a pointer into the global buffer by the stack size and its - * padding. - * - * @param[in,out] raw On input: an aligned pointer into the global buffer. - * On output: an aligned pointer past the stack position. 
- */ - void applyStackShift( char * &raw ) const noexcept { - const size_t totalShift = - internal::Coordinates< reference >::stackSize( n ) + - stackShift; - #ifdef _DEBUG_REFERENCE_BLAS3 - std::cout << "\t\t\t shifting input pointer with " - << internal::Coordinates< reference >::arraySize( n ) << " + " - << stackShift << " = " << totalShift << "bytes \n"; - #endif - raw += totalShift; - } - - }; -#endif - - /** - * Retrieves the SPA buffers for the calling thread. - * - * \warning This function must be called from within an OpenMP parallel - * section. - * - * @param[out] arr Where the bit-array may be located. - * @param[out] buf Where the stack may be located. - * @param[out] valbuf Where the value buffer may be located. - * - * All above pointers are aligned on sizeof(int) bytes. - * - * @param[in] md Meta-data for global buffer management. - * @param[in] C The output matrix. - * - * One thread uses the buffers pre-allocated with the matrix \a C, thus - * ensuring at least one thread may perform the #grb::mxm. Any remainder - * threads can only help process the #grb::mxm if there is enough global - * buffer memory available. - * - * - * \note The global memory has size \f$ \Omega( \mathit{nz} ) \f$, which may - * be several factors (or even asymptotically greater than) - * \f$ \max\{ m, n \} \f$. - * - * \note In case the application stores multiple matrices, the global buffer - * may additionally be greater than the above note indicates if at least - * one of the other matrices is significantly (or asymptotically) larger - * than the one involved with the #grb::mxm. - */ - template< - typename OutputType, - typename RIT, typename CIT, typename NIT - > - void mxm_ompPar_getSPABuffers( - char * &arr, char * &buf, OutputType * &valbuf, - const struct MXM_BufferMetaData< NIT, OutputType > &md, - Matrix< OutputType, reference, RIT, CIT, NIT > &C - ) { -#ifdef _H_GRB_REFERENCE_OMP_BLAS3 - // other threads use the global buffer to create additional SPAs - { - const size_t t = config::OMP::current_thread_ID(); - #ifndef NDEBUG - const size_t T = config::OMP::current_threads(); - assert( t < T ); - #endif - if( t > 0 ) { - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - std::cout << "\t Thread " << t << " gets buffers from global buffer\n"; - #endif - char * rawBuffer = md.getSPABuffers( t ); - assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 ); - arr = rawBuffer; - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - #endif - md.applyArrayShift( rawBuffer ); - assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 ); - buf = rawBuffer; - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - #endif - md.applyStackShift( rawBuffer ); - assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 ); - assert( buf != arr ); - valbuf = reinterpret_cast< OutputType * >(rawBuffer); - assert( static_cast< void * >(valbuf) != static_cast< void * >(buf) ); - } else { - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - std::cout << "\t Thread " << t << " gets buffers from matrix storage\n"; - #endif - // one thread uses the standard matrix buffer - internal::getMatrixBuffers( arr, buf, valbuf, 1, C ); - } - #ifdef _DEBUG_REFERENCE_BLAS3 - #pragma omp critical - { - std::cout << "\t Thread " << t << " has SPA array @ " - << static_cast< void * >( arr ) << " and SPA stack @ " - << static_cast< void * >( buf ) << " and SPA values @ " - << static_cast< void * >( valbuf ) << "\n"; - } - #endif - } -#else - #ifdef _DEBUG_REFERENCE_BLAS3 - std::cout << 
"\t Reference backend gets buffers from global buffer\n"; - #endif - internal::getMatrixBuffers( arr, buf, valbuf, 1, C ); - (void) md; -#endif - } - /** * Given a computed new row_start array, moves the old index and value arrays * to new offsets. This leaves precisely enough space for the mxm algorithm to @@ -1215,6 +789,7 @@ namespace grb { // a basic analytic model based on the number of nonzeroes size_t max_threads = config::OMP::threads(); + assert( max_threads > 0 ); { size_t target_nnz = 0; if( phase == EXECUTE ) { @@ -1226,7 +801,7 @@ namespace grb { const size_t nnz_based_nthreads = target_nnz / config::CACHE_LINE_SIZE::value(); if( nnz_based_nthreads < max_threads ) { - max_threads = nnz_based_nthreads; + max_threads = nnz_based_nthreads > 0 ? nnz_based_nthreads : 1; } #ifdef _DEBUG_REFERENCE_BLAS3 std::cout << "\t simple analytic model selects max threads of " @@ -1234,7 +809,7 @@ namespace grb { #endif } - MXM_BufferMetaData< NIT, OutputType > bufferMD( m, n, max_threads ); + SPA_BufferMetaData< NIT, OutputType > bufferMD( m, n, max_threads ); #ifdef _H_GRB_REFERENCE_OMP_BLAS3 // derive number of threads @@ -1255,7 +830,7 @@ namespace grb { char * arr = nullptr; char * buf = nullptr; OutputType * valbuf = nullptr; - mxm_ompPar_getSPABuffers( arr, buf, valbuf, bufferMD, C ); + spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C ); // do count size_t local_nzc; @@ -1296,7 +871,7 @@ namespace grb { char * arr = nullptr; char * buf = nullptr; OutputType * valbuf = nullptr; - mxm_ompPar_getSPABuffers( arr, buf, valbuf, bufferMD, C ); + spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C ); #ifdef _DEBUG_REFERENCE_BLAS3 #ifdef _H_GRB_REFERENCE_OMP_BLAS3 #pragma omp critical diff --git a/include/graphblas/reference/compressed_storage.hpp b/include/graphblas/reference/compressed_storage.hpp index 95b67a903..2d40d18cc 100644 --- a/include/graphblas/reference/compressed_storage.hpp +++ b/include/graphblas/reference/compressed_storage.hpp @@ -1156,7 +1156,7 @@ namespace grb { {} /** Move constructor. */ - Compressed_Storage< void, IND, SIZE >( SelfType &&other ) { + Compressed_Storage( SelfType &&other ) { moveFromOther( other ); } diff --git a/include/graphblas/reference/io.hpp b/include/graphblas/reference/io.hpp index f4f1b0709..b4af46eea 100644 --- a/include/graphblas/reference/io.hpp +++ b/include/graphblas/reference/io.hpp @@ -951,14 +951,15 @@ namespace grb { ); static_assert( ( !(descr & descriptors::no_casting) || - ( A_is_mask && std::is_same< InputType2, OutputType >::value ) ), + ( A_is_mask && std::is_same< InputType2, OutputType >::value ) || + ( !A_is_mask && std::is_same< InputType1, OutputType >::value ) ), "grb::internal::set_copy called with non-matching value types. This is an " "internal error. Please submit a bug report." ); static_assert( - !(descr & descriptors::invert_mask), "internal::grb::set_copy called with " - "the invert_mask descriptor. This is an internal error; please submit a " - "bug report." + !A_is_mask || !(descr & descriptors::invert_mask), + "internal::grb::set_copy called with the invert_mask descriptor. This is " + "an internal error; please submit a bug report." 
 );
 
 // run-time checks
@@ -2028,6 +2029,274 @@ namespace grb {
 }
 }
 
+ template<
+ Descriptor descr = descriptors::no_operation,
+ typename OutputType, typename MaskType, typename InputType,
+ typename RIT1, typename CIT1, typename NIT1,
+ typename RIT2, typename CIT2, typename NIT2,
+ typename RIT3, typename CIT3, typename NIT3
+ >
+ RC set(
+ Matrix< OutputType, reference, RIT1, CIT1, NIT1 > &C,
+ const Matrix< MaskType, reference, RIT2, CIT2, NIT2 > &M,
+ const Matrix< InputType, reference, RIT3, CIT3, NIT3 > &A,
+ const Phase &phase = EXECUTE
+ ) noexcept {
+ // static checks
+ static_assert(
+ !std::is_void< InputType >::value ||
+ std::is_void< OutputType >::value, "grb::set (masked set to matrix): "
+ "cannot have a pattern matrix as input unless the output is also a pattern "
+ "matrix"
+ );
+ static_assert(
+ std::is_convertible< InputType, OutputType >::value ||
+ std::is_void< OutputType >::value,
+ "grb::set (masked set to matrix): input type cannot be "
+ "converted to output type"
+ );
+ static_assert(
+ !((descr & descriptors::structural) && (descr & descriptors::invert_mask)),
+ "grb::set (masked set to matrix) may not be called with both the structural "
+ "and invert_mask descriptors set"
+ );
+ NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) ||
+ std::is_same< InputType, OutputType >::value
+ ), "grb::set",
+ "called with non-matching value types"
+ );
+
+ // dynamic checks
+#ifdef _DEBUG_REFERENCE_IO
+ std::cout << "Called grb::set (matrix-to-matrix-masked, reference)\n";
+#endif
+ assert( phase != TRY );
+ const size_t nrows = grb::nrows( C );
+ const size_t ncols = grb::ncols( C );
+ const size_t m = grb::nrows( M );
+ const size_t n = grb::ncols( M );
+
+ // check for trivial dispatch first (otherwise the below checks fail when they
+ // should not)
+ if( m == 0 || n == 0 ) {
+#ifdef _DEBUG_REFERENCE_IO
+ std::cout << "\t delegating to unmasked matrix-to-matrix set, reference\n";
+#endif
+ // If the mask is empty, ignore it
+ return set< descr >( C, A, phase );
+ }
+
+ // dynamic checks, continued
+ if( nrows != grb::nrows( A ) || nrows != m ) {
+ return MISMATCH;
+ }
+
+ if( ncols != grb::ncols( A ) || ncols != n ) {
+ return MISMATCH;
+ }
+
+ // go for implementation, preliminaries:
+ size_t nzc = 0;
+ const auto &A_raw = internal::getCRS( A );
+ const auto &mask_raw = internal::getCRS( M );
+ // we now have one (guaranteed) SPA, namely the one allocated with the output
+ // matrix. We now are going to check how many more SPAs ideally we would like
+ // (for reference_omp), and then go about trying to get those.
+ // If, finally, we get just this one SPA, we will go into this
+ // mostly-sequential code (essentially, big-Omega nrows):
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ const size_t nnz_based_nthreads = std::min( config::OMP::threads(),
+ std::max< size_t >( 1, grb::nnz( A ) / config::CACHE_LINE_SIZE::value() ) );
+ grb::internal::SPA_BufferMetaData< NIT1, OutputType > bufferMD( m, n,
+ nnz_based_nthreads );
+ const size_t nthreads = bufferMD.threads();
+#ifdef _DEBUG_REFERENCE_IO
+ std::cout << "\t set( matrix, matrix, matrix ) will use " << nthreads
+ << " threads\n";
+#endif
+#else
+ const size_t nthreads = 1;
+#endif
+ if( nthreads == 1 ) {
+ char * arr = nullptr;
+ char * buf = nullptr;
+ OutputType * valbuf = nullptr;
+ internal::Coordinates< reference > coors;
+ internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+ coors.set( arr, false, buf, ncols );
+ for( size_t i = 0; i < nrows; ++i ) {
+ coors.clear();
+ for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = mask_raw.row_index[ k ];
+ if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+ coors.assign( k_col );
+ }
+ }
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ #pragma omp parallel for reduction( +: nzc ) \
+ schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+#endif
+ for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = A_raw.row_index[ k ];
+ if( coors.assigned( k_col ) ) {
+ (void) ++nzc;
+ }
+ }
+ }
+ } else {
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ #pragma omp parallel num_threads( nthreads ) reduction( +: nzc )
+ {
+ // get thread-local buffers
+ size_t local_nz = 0;
+ char * arr = nullptr;
+ char * buf = nullptr;
+ OutputType * valbuf = nullptr;
+ internal::spa_ompPar_getBuffers( arr, buf, valbuf, bufferMD, C );
+ internal::Coordinates< reference > coors;
+ coors.set_seq( arr, false, buf, n );
+ // follow dynamic schedule since we cannot predict sparsity structure
+ #pragma omp for schedule( dynamic, config::CACHE_LINE_SIZE::value() )
+ for( size_t i = 0; i < nrows; ++i ) {
+ coors.clear();
+ for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = mask_raw.row_index[ k ];
+ if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+ coors.assign( k_col );
+ }
+ }
+ for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = A_raw.row_index[ k ];
+ if( coors.assigned( k_col ) ) {
+ (void) ++local_nz;
+ }
+ }
+ }
+ nzc += local_nz;
+ }
+#else
+ const bool code_path_should_not_be_reached = false;
+ std::cerr << "\t logic error in grb::set( matrix, matrix, matrix ): "
+ << "code path should not reach here. Please submit a bug report\n";
+ assert( code_path_should_not_be_reached );
+ #ifdef NDEBUG
+ (void) code_path_should_not_be_reached;
+ #endif
+#endif
+ }
+
+ // we now have a count. If we're in the resize phase that means we're done:
+ if( phase == RESIZE ) {
+ return resize( C, nzc );
+ }
+
+ // otherwise, we now compute the output. We start with checking capacity
+ assert( phase == EXECUTE );
+ if( capacity( C ) < nzc ) {
+#ifdef _DEBUG_REFERENCE_IO
+ std::cout << "\t insufficient capacity to complete "
+ "requested masked set matrix to matrix computation\n";
+#endif
+ const RC clear_rc = clear( C );
+ if( clear_rc != SUCCESS ) {
+ return PANIC;
+ } else {
+ return ILLEGAL;
+ }
+ }
+
+ // get output CRS and CCS structures
+ // TODO: check for crs_only descriptor
+ auto &CRS_raw = internal::getCRS( C );
+ auto &CCS_raw = internal::getCCS( C );
+ config::NonzeroIndexType * C_col_index = internal::template
+ getReferenceBuffer< typename config::NonzeroIndexType >( ncols + 1 );
+ CRS_raw.col_start[ 0 ] = 0;
+
+#ifdef _H_GRB_REFERENCE_OMP_IO
+ // TODO ALPify the below
+ #pragma omp parallel for simd
+#endif
+ for( size_t j = 0; j <= ncols; ++j ) {
+ CCS_raw.col_start[ j ] = 0;
+ C_col_index[ j ] = 0;
+ }
+
+ //TODO: revise the below, WIP
+ char * arr = nullptr;
+ char * buf = nullptr;
+ MaskType * valbuf = nullptr;
+ internal::Coordinates< reference > coors;
+ internal::getMatrixBuffers( arr, buf, valbuf, 1, M );
+ coors.set( arr, false, buf, ncols );
+
+ // do counting sort, phase 1 -- also this loop should employ the same
+ // parallelisation strategy during counting
+ nzc = 0;
+ for( size_t i = 0; i < nrows; ++i ) {
+ coors.clear();
+ for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = mask_raw.row_index[ k ];
+ if( utils::interpretMatrixMask< descr, MaskType >( true, mask_raw.getValues(), k ) ) {
+ coors.assign( k_col );
+ }
+ }
+ for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = A_raw.row_index[ k ];
+ if( coors.assigned( k_col ) ) {
+ (void) nzc++;
+ (void) (CCS_raw.col_start[ k_col + 1 ])++;
+ }
+ }
+ CRS_raw.col_start[ i + 1 ] = nzc;
+ }
+
+ // TODO this is a prefix sum -- use the OMP utility function here to
+ // parallelise it
+ for( size_t j = 1; j < ncols; ++j ) {
+ CCS_raw.col_start[ j + 1 ] += CCS_raw.col_start[ j ];
+ }
+
+ // do counting sort, phase 2 -- use previously computed CCS offset array to
+ // update CCS during the computational phase. Also this loop should employ
+ // the same (multiple-SPA) parallelisation strategy as above
+ nzc = 0;
+ for( size_t i = 0; i < nrows; ++i ) {
+ coors.clear();
+ for( auto k = mask_raw.col_start[ i ]; k < mask_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = mask_raw.row_index[ k ];
+ if( utils::interpretMatrixMask< descr, MaskType >(
+ true, mask_raw.getValues(), k )
+ ) {
+ coors.assign( k_col );
+ }
+ }
+ for( auto k = A_raw.col_start[ i ]; k < A_raw.col_start[ i + 1 ]; ++k ) {
+ const auto k_col = A_raw.row_index[ k ];
+ if( coors.assigned( k_col ) ) {
+ constexpr int zero = 0;
+ CRS_raw.row_index[ nzc ] = k_col;
+ CRS_raw.setValue( nzc, A_raw.getValue( k, zero ) );
+ const size_t CCS_index = C_col_index[ k_col ] + CCS_raw.col_start[ k_col ];
+ (void) C_col_index[ k_col ]++;
+ CCS_raw.row_index[ CCS_index ] = i;
+ CCS_raw.setValue( CCS_index, A_raw.getValue( k, zero ) );
+ (void) nzc++;
+ }
+ }
+ }
+#ifndef NDEBUG
+ #ifdef _H_GRB_REFERENCE_OMP_IO
+ #pragma omp parallel for schedule( static, config::CACHE_LINE_SIZE::value() )
+ #endif
+ for( size_t j = 0; j < ncols; ++j ) {
+ assert( CCS_raw.col_start[ j + 1 ] - CCS_raw.col_start[ j ] ==
+ C_col_index[ j ] );
+ }
+#endif
+ internal::setCurrentNonzeroes( C, nzc );
+
+ // done
+ return SUCCESS;
+ }
+
 /**
 * Ingests raw data into a GraphBLAS vector.
 *
diff --git a/include/graphblas/reference/matrix.hpp b/include/graphblas/reference/matrix.hpp
index c1b3a365d..84873a22c 100644
--- a/include/graphblas/reference/matrix.hpp
+++ b/include/graphblas/reference/matrix.hpp
@@ -1283,7 +1283,9 @@ namespace grb {
 "We hit here a configuration border case which the implementation does not "
 "handle at present. Please submit a bug report."
 );
- // compute and return
+ // compute and return the maximum of
+ // - row- and column-wise buffers. The added factor two is for padding
+ // - minimal buffer requirement for parallel buildMatrixUnique
 return std::max( (m + n + 2) * globalBufferUnitSize,
 #ifdef _H_GRB_REFERENCE_OMP_MATRIX
 config::OMP::threads() * config::CACHE_LINE_SIZE::value() *
@@ -2364,7 +2366,452 @@ namespace grb {
 );
 return ret;
 }
+
+#ifndef _H_GRB_REFERENCE_OMP_MATRIX
+ /**
+ * Meta-data for global buffer management for use with #grb::mxm and the
+ * matrix-matrix-matrix variant of #grb::set.
+ *
+ * This class contains all meta-data necessary to interpret the global buffer
+ * as an array of sparse accumulators (SPAs). The length of the array is one
+ * less than the value returned by #threads(), since thread zero does not draw
+ * its SPA from the global buffer. A call to #threads() retrieves how many
+ * threads can be used to process the call at hand.
+ *
+ * Each SPA has the layout (bitarray, stack, valueArray). These are packed in a
+ * padded byte array, such that each bit array, stack, and value array is
+ * aligned on sizeof(int) bytes.
+ *
+ * @tparam NIT The nonzero index type.
+ * @tparam ValueType The output matrix value type.
+ *
+ * This meta-data class applies to both the sequential (reference) and shared-
+ * memory parallel (reference_omp) backends.
+ */
+ template< typename NIT, typename ValueType >
+ class SPA_BufferMetaData {
+
+ static_assert( sizeof(NIT) % sizeof(int) == 0, "Unsupported type for NIT; "
+ "please submit a bug report!" );
+
+ private:
+
+ /** The size of the offset array */
+ size_t m;
+
+ /** The size of the SPA */
+ size_t n;
+
+ /** The number of threads supported during a call to #grb::mxm */
+ size_t nthreads;
+
+ /** The initial buffer offset */
+ size_t bufferOffset;
+
+ /** The size of a single SPA, including bytes needed for padding */
+ size_t paddedSPASize;
+
+ /** The number of bytes to pad the SPA array with */
+ size_t arrayShift;
+
+ /** The number of bytes to pad the SPA stack with */
+ size_t stackShift;
+
+ /**
+ * Given a number of used bytes of the buffer, calculate the available
+ * remainder buffer and return it.
+ *
+ * @param[in] osize The size of the buffer (in bytes) that is already
+ * in use.
+ * @param[out] remainder Pointer to any remainder buffer.
+ * @param[out] rsize The size of the remainder buffer.
+ *
+ * If no buffer space is left, \a remainder will be set to nullptr
+ * and \a rsize to 0.
+ */
+ void retrieveRemainderBuffer(
+ const size_t osize,
+ void * &remainder, size_t &rsize
+ ) const noexcept {
+ const size_t size = internal::template getCurrentBufferSize< char >();
+ char * rem = internal::template getReferenceBuffer< char >( size );
+ size_t rsize_calc = size - osize;
+ rem += osize;
+ const size_t mod = reinterpret_cast< uintptr_t >(rem) % sizeof(int);
+ if( mod ) {
+ const size_t shift = sizeof(int) - mod;
+ if( rsize_calc >= shift ) {
+ rsize_calc -= shift;
+ rem += shift;
+ } else {
+ rsize_calc = 0;
+ rem = nullptr;
+ }
+ }
+ assert( !(reinterpret_cast< uintptr_t >(rem) % sizeof(int)) );
+ // write out
+ remainder = rem;
+ rsize = rsize_calc;
+ }
+
+
+ public:
+
+ /**
+ * Base constructor.
+ * + * @param[in] _m The length of the offset array. + * @param[in] _n The length of the SPA. + * @param[in] max_threads The maximum number of threads. + * + * \note \a max_threads is a separate input since there might be a need to + * cap the maximum number of threads used based on some analytic + * performance model. Rather than putting such a performance model + * within this class, we make it an obligatory input parameter + * instead. + * + * \note It is always valid to pass config::OMP::threads(). + * + * \note This class \em will, however, cap the number of threads returned + * to \a _n. + */ + SPA_BufferMetaData( + const size_t _m, const size_t _n, + const size_t max_threads + ) : m( _m ), n( _n ), arrayShift( 0 ), stackShift( 0 ) { + #ifdef _DEBUG_REFERENCE_MATRIX + #pragma omp critical + std::cout << "\t\t\t computing padded buffer size for a SPA of length " + << n << " while leaving space for an additional offset buffer of length " + << std::max( m, n ) << "...\n"; + #endif + // compute bufferOffset + bufferOffset = (std::max( m, n ) + 1) * sizeof( NIT ); + + // compute value buffer size + const size_t valBufSize = n * utils::SizeOf< ValueType >::value; + + #ifdef _DEBUG_REFERENCE_MATRIX + std::cout << "\t\t\t\t bit-array size has byte-size " << + internal::Coordinates< reference >::arraySize( n ) << "\n"; + std::cout << "\t\t\t\t stack has byte-size " << + internal::Coordinates< reference >::stackSize( n ) << "\n"; + std::cout << "\t\t\t\t value buffer has byte-size " << valBufSize << "\n"; + #endif + + // compute paddedSPASize + paddedSPASize = + internal::Coordinates< reference >::arraySize( n ) + + internal::Coordinates< reference >::stackSize( n ) + + valBufSize; + size_t shift = + internal::Coordinates< reference >::arraySize( n ) % sizeof(int); + if( shift != 0 ) { + arrayShift = sizeof(int) - shift; + paddedSPASize += arrayShift; + } + shift = internal::Coordinates< reference >::stackSize( n ) % sizeof(int); + if( shift != 0 ) { + stackShift = sizeof(int) - shift; + paddedSPASize += stackShift; + } + shift = valBufSize % sizeof(int); + if( shift != 0 ) { + paddedSPASize += (sizeof(int) - shift); + } + + // pad bufferOffset + shift = bufferOffset % sizeof(int); + if( shift != 0 ) { + bufferOffset += (sizeof(int) - shift); + } + + // compute free buffer size + const size_t freeBufferSize = internal::getCurrentBufferSize< char >() - + bufferOffset; + + // compute max number of threads + nthreads = 1 + freeBufferSize / paddedSPASize; + #ifdef _DEBUG_REFERENCE_MATRIX + #pragma omp critical + std::cout << "\t\t\t free buffer size: " << freeBufferSize + << ", (padded) SPA size: " << paddedSPASize + << ", bufferOffset: " << bufferOffset + << " -> supported #threads: " << nthreads << ". " + << " The shifts for the bit-array and the stack are " << arrayShift + << ", respectively, " << stackShift << "." + << "\n"; + #endif + // cap the final number of selected threads + if( nthreads > max_threads ) { + nthreads = max_threads; + } + if( nthreads > n ) { + nthreads = n; + } + } + + /** @returns The maximum number of supported threads during #grb::mxm */ + size_t threads() const noexcept { + return nthreads; + } + + /** + * Requests and returns a global buffer required for a thread-local SPA. + * + * @param[in] t The thread ID. Must be larger than 0. + * + * \note Thread 0 employs the SPA allocated with the output matrix. + * + * @returns Pointer into the global buffer starting at the area reserved for + * the SPA of thread \a t. 
+			 */
+			char * getSPABuffers( size_t t ) const noexcept {
+				assert( t > 0 );
+				assert( nthreads > 1 );
+				// thread zero uses the SPA pre-allocated with the output matrix, so
+				// shift the remaining thread IDs down by one
+				(void) --t;
+				char * raw = internal::template getReferenceBuffer< char >(
+					bufferOffset + (nthreads - 1) * paddedSPASize );
+				assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
+				raw += bufferOffset;
+				assert( reinterpret_cast< uintptr_t >(raw) % sizeof(int) == 0 );
+				raw += t * paddedSPASize;
+				return raw;
+			}
+
+			/**
+			 * Retrieves the column offset buffer.
+			 *
+			 * @param[out] remainder Returns any remainder buffer beyond that of the
+			 *                       column offset buffer.
+			 * @param[out] rsize     The remainder buffer size \a remainder points to.
+			 *
+			 * If \a remainder is not a nullptr then neither may \a rsize be,
+			 * and vice versa.
+			 *
+			 * Retrieving any remainder buffer is optional. The default is to not ask
+			 * for it.
+			 *
+			 * \warning If all buffer memory is used for the column offsets, it may be
+			 *          that \a remainder equals nullptr and \a rsize zero.
+			 *
+			 * \warning This buffer is only guaranteed exclusive if only the retrieved
+			 *          column buffer is used. In particular, if also requesting (and
+			 *          using) SPA buffers, the remainder buffer area is shared with
+			 *          those SPA buffers, and data races are likely to occur. In
+			 *          other words: be very careful with any use of these remainder
+			 *          buffers.
+			 *
+			 * @returns The column offset buffer.
+			 *
+			 * \warning This buffer overlaps with the CRS offset buffer. The caller
+			 *          must ensure to only ever use one at a time.
+			 */
+			NIT * getColOffsetBuffer(
+				void * * const remainder = nullptr,
+				size_t * const rsize = nullptr
+			) const noexcept {
+				NIT * const ret = internal::template getReferenceBuffer< NIT >( n + 1 );
+				if( remainder != nullptr || rsize != nullptr ) {
+					assert( remainder != nullptr && rsize != nullptr );
+					retrieveRemainderBuffer( (n + 1) * sizeof(NIT), *remainder, *rsize );
+				}
+				return ret;
+			}
+
+			/**
+			 * Retrieves the row offset buffer.
+			 *
+			 * @param[out] remainder Returns any remainder buffer beyond that of the
+			 *                       row offset buffer.
+			 * @param[out] rsize     The remainder buffer size \a remainder points to.
+			 *
+			 * If \a remainder is not a nullptr then neither may \a rsize be,
+			 * and vice versa.
+			 *
+			 * Retrieving any remainder buffer is optional. The default is to not ask
+			 * for it.
+			 *
+			 * \warning If all buffer memory is used for the row offsets, it may be
+			 *          that \a remainder equals nullptr and \a rsize zero.
+			 *
+			 * \warning This buffer is only guaranteed exclusive if only the retrieved
+			 *          row buffer is used. In particular, if also requesting (and
+			 *          using) SPA buffers, the remainder buffer area is shared with
+			 *          those SPA buffers, and data races are likely to occur. In
+			 *          other words: be very careful with any use of these remainder
+			 *          buffers.
+			 *
+			 * @returns The row offset buffer.
+			 *
+			 * \warning This buffer overlaps with the CCS offset buffer. The caller
+			 *          must ensure to only ever use one at a time.
+			 */
+			NIT * getRowOffsetBuffer(
+				void * * const remainder = nullptr,
+				size_t * const rsize = nullptr
+			) const noexcept {
+				NIT * const ret = internal::template getReferenceBuffer< NIT >( m + 1 );
+				if( remainder != nullptr || rsize != nullptr ) {
+					assert( remainder != nullptr && rsize != nullptr );
+					retrieveRemainderBuffer( (m + 1) * sizeof(NIT), *remainder, *rsize );
+				}
+				return ret;
+			}
+
+			/**
+			 * Shifts a pointer into the global buffer by the bit-array size and its
+			 * padding.
+			 *
+			 * @param[in,out] raw On input: an aligned pointer into the global buffer.
+			 *                    On output: an aligned pointer past the bit-array
+			 *                    position.
+			 */
+			void applyArrayShift( char * &raw ) const noexcept {
+				const size_t totalShift =
+					internal::Coordinates< reference >::arraySize( n ) +
+					arrayShift;
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				std::cout << "\t\t\t shifting input pointer with "
+					<< internal::Coordinates< reference >::arraySize( n ) << " + "
+					<< arrayShift << " = " << totalShift << " bytes\n";
+				#endif
+				raw += totalShift;
+			}
+
+			/**
+			 * Shifts a pointer into the global buffer by the stack size and its
+			 * padding.
+			 *
+			 * @param[in,out] raw On input: an aligned pointer into the global buffer.
+			 *                    On output: an aligned pointer past the stack position.
+			 */
+			void applyStackShift( char * &raw ) const noexcept {
+				const size_t totalShift =
+					internal::Coordinates< reference >::stackSize( n ) +
+					stackShift;
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				std::cout << "\t\t\t shifting input pointer with "
+					<< internal::Coordinates< reference >::stackSize( n ) << " + "
+					<< stackShift << " = " << totalShift << " bytes\n";
+				#endif
+				raw += totalShift;
+			}
+
+	};
+#endif
+
+	/**
+	 * Retrieves the SPA buffers for the calling thread.
+	 *
+	 * \warning This function must be called from within an OpenMP parallel
+	 *          section.
+	 *
+	 * @param[out] arr    Where the bit-array will be located.
+	 * @param[out] buf    Where the stack will be located.
+	 * @param[out] valbuf Where the value buffer will be located.
+	 *
+	 * All above pointers are aligned on sizeof(int) bytes.
+	 *
+	 * @param[in]     md Meta-data for global buffer management.
+	 * @param[in,out] C  The output matrix.
+	 *
+	 * One thread uses the buffers pre-allocated with the matrix \a C, thus
+	 * ensuring at least one thread may perform the #grb::mxm. Any remaining
+	 * threads can only help process the #grb::mxm if there is enough global
+	 * buffer memory available.
+	 *
+	 * \note The global memory has size \f$ \Omega( \mathit{nz} ) \f$, which may
+	 *       be several factors larger than, or even asymptotically greater
+	 *       than, \f$ \max\{ m, n \} \f$.
+	 *
+	 * \note In case the application stores multiple matrices, the global buffer
+	 *       may additionally be greater than the above note indicates if at
+	 *       least one of the other matrices is significantly (or
+	 *       asymptotically) larger than the one involved with the #grb::mxm.
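+	 *
+	 * A minimal usage sketch, assuming an OpenMP-enabled build as well as a
+	 * previously constructed meta-data instance \a md and output matrix \a C:
+	 *
+	 * \code
+	 * #pragma omp parallel num_threads( static_cast< int >( md.threads() ) )
+	 * {
+	 *     char * arr = nullptr, * buf = nullptr;
+	 *     OutputType * valbuf = nullptr;
+	 *     spa_ompPar_getBuffers( arr, buf, valbuf, md, C );
+	 *     // each thread now holds an exclusive (bit-array, stack, values) SPA
+	 * }
+	 * \endcode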
+	 */
+	template<
+		typename OutputType,
+		typename RIT, typename CIT, typename NIT
+	>
+	void spa_ompPar_getBuffers(
+		char * &arr, char * &buf, OutputType * &valbuf,
+		const struct SPA_BufferMetaData< NIT, OutputType > &md,
+		Matrix< OutputType, reference, RIT, CIT, NIT > &C
+	) {
+#ifdef _H_GRB_REFERENCE_OMP_MATRIX
+		// other threads use the global buffer to create additional SPAs
+		{
+			const size_t t = config::OMP::current_thread_ID();
+			#ifndef NDEBUG
+			const size_t T = config::OMP::current_threads();
+			assert( t < T );
+			#endif
+			if( t > 0 ) {
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				std::cout << "\t Thread " << t << " gets buffers from global buffer\n";
+				#endif
+				char * rawBuffer = md.getSPABuffers( t );
+				assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+				arr = rawBuffer;
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				#endif
+				md.applyArrayShift( rawBuffer );
+				assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+				buf = rawBuffer;
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				#endif
+				md.applyStackShift( rawBuffer );
+				assert( reinterpret_cast< uintptr_t >(rawBuffer) % sizeof(int) == 0 );
+				assert( buf != arr );
+				valbuf = reinterpret_cast< OutputType * >(rawBuffer);
+				assert( static_cast< void * >(valbuf) != static_cast< void * >(buf) );
+			} else {
+				#ifdef _DEBUG_REFERENCE_MATRIX
+				#pragma omp critical
+				std::cout << "\t Thread " << t << " gets buffers from matrix storage\n";
+				#endif
+				// one thread uses the standard matrix buffer
+				internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+			}
+			#ifdef _DEBUG_REFERENCE_MATRIX
+			#pragma omp critical
+			{
+				std::cout << "\t Thread " << t << " has SPA array @ "
+					<< static_cast< void * >( arr ) << " and SPA stack @ "
+					<< static_cast< void * >( buf ) << " and SPA values @ "
+					<< static_cast< void * >( valbuf ) << "\n";
+			}
+			#endif
+		}
+#else
+	#ifdef _DEBUG_REFERENCE_MATRIX
+	std::cout << "\t Reference backend gets buffers from matrix storage\n";
+	#endif
+	internal::getMatrixBuffers( arr, buf, valbuf, 1, C );
+	(void) md;
 #endif
+	}

 } // end namespace grb::internal

diff --git a/include/graphblas/utils.hpp b/include/graphblas/utils.hpp
index 3ab508eec..4a2235501 100644
--- a/include/graphblas/utils.hpp
+++ b/include/graphblas/utils.hpp
@@ -384,9 +384,11 @@ namespace grb {
 			return ret;
 		}
 	}
-
-	/** Specialisation for void-valued matrice's masks */
-	template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
+	/** Specialisation for void-valued matrix masks */
+	template<
+		Descriptor descriptor,
+		typename MatrixDataType, typename ValuesType
+	>
 	static bool interpretMatrixMask(
 		const bool &assigned,
 		const ValuesType * const values,
@@ -398,8 +400,11 @@
 		return interpretMask< descriptor, ValuesType >( assigned, values, k );
 	}

-	/** Specialisation for void-valued matrice's masks */
-	template< Descriptor descriptor, typename MatrixDataType, typename ValuesType >
+	/** Specialisation for void-valued matrix masks */
+	template<
+		Descriptor descriptor,
+		typename MatrixDataType, typename ValuesType
+	>
 	static bool interpretMatrixMask(
 		const bool &assigned,
 		const ValuesType * const,
diff --git a/include/graphblas/utils/parser/matrixFileReaderBase.hpp b/include/graphblas/utils/parser/matrixFileReaderBase.hpp
index c5cdf8b7a..15aa3e4f8 100644
--- a/include/graphblas/utils/parser/matrixFileReaderBase.hpp
+++ b/include/graphblas/utils/parser/matrixFileReaderBase.hpp
@@ -280,7 +280,7 @@ namespace grb {
 		properties._nz = nz;
 		properties._entries = entries;
 		properties._pattern = pattern;
-		properties._symmetric = symmetric;
+		properties._symmetric = symmetric ? Symmetric : General;
 		properties._direct = direct;
 		properties._symmetricmap = symmetricmap;
 		// check for existance of file
diff --git a/src/graphblas/hyperdags/hyperdags.cpp b/src/graphblas/hyperdags/hyperdags.cpp
index 256b31fba..320b522f5 100644
--- a/src/graphblas/hyperdags/hyperdags.cpp
+++ b/src/graphblas/hyperdags/hyperdags.cpp
@@ -215,6 +215,9 @@ std::string grb::internal::hyperdags::toString(
 		case SET_MATRIX_MATRIX_INPUT2:
 			return "set( matrix, matrix, scalar )";

+		case SET_MATRIX_MATRIX_MASKED:
+			return "set( matrix, matrix, matrix )";
+
 		case MXM_MATRIX_MATRIX_MATRIX_MONOID:
 			return "mxm( matrix, matrix, matrix, monoid, scalar, scalar )";

diff --git a/tests/unit/id.cpp b/tests/unit/id.cpp
index a7a7cdb0b..8711369ad 100644
--- a/tests/unit/id.cpp
+++ b/tests/unit/id.cpp
@@ -216,7 +216,7 @@ void grb_program2( const struct input &in, struct output &out ) {
 }

 /**
- * Test for move assignement id cleanup.
+ * Test for move assignment id cleanup.
 *
 * Creating and performing move assignment on multiple new objects and check
 * for collisions.
diff --git a/tests/unit/matrixSet.cpp b/tests/unit/matrixSet.cpp
index c6ab6739d..da3d48f64 100644
--- a/tests/unit/matrixSet.cpp
+++ b/tests/unit/matrixSet.cpp
@@ -19,6 +19,7 @@
 #include
 #include
+#include <graphblas/algorithms/matrix_factory.hpp>

 using namespace grb;

@@ -27,8 +28,356 @@
 static const int data1[ 15 ] = { 4, 7, 4, 6, 4, 7, 1, 7, 3, 6, 7, 5, 1, 8, 7 };
 static const size_t I[ 15 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 8, 7, 6 };
 static const size_t J[ 15 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 5, 7, 5, 1 };

+/** Generic implementation of masked tests */
+template<
+	Descriptor descr = descriptors::no_operation,
+	typename Tout, typename Tmask, typename Tin
+>
+RC masked_tests_generic_impl(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0;
+
+	std::cout << "\t\t with structural descriptor\n";
+	rc = grb::set< descr | descriptors::structural >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( grb::nnz( output ) != 2 * n - 1 ) {
+		std::cerr << "\t unexpected number of output elements ( "
+			<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+		rc = FAILED;
+	}
+	for( const auto &triplet : output ) {
+		if(
+			triplet.first.first != triplet.first.second &&
+			triplet.first.first != triplet.first.second - 1
+		) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ), value " << triplet.second << ".\n";
+			rc = FAILED;
+		}
+		if( triplet.first.first != static_cast< size_t >(triplet.second) ) {
+			std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", "
+				<< triplet.first.second << " ) with value " << triplet.second;
+			std::cerr << ", expected value "<< triplet.first.first <<".\n";
+			rc = FAILED;
+		}
+	}
+	if( rc != SUCCESS ) { return rc; }
+
+	std::cout << "\t\t without descriptor\n";
+	rc = grb::set< descr >( output, mask, input );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n";
+		return rc;
+	}
+	if( emptyMask ) {
+		if( grb::nnz( output ) != 2 * n - 1 ) {
+			std::cerr << "\t unexpected number of output elements ( "
+				<< grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n";
+			rc = 
FAILED; + } + } else { + if( grb::nnz( output ) != n ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << n <<".\n"; + rc = FAILED; + } + } + for( const auto &triplet : output ) { + if( emptyMask ) { + if( triplet.first.first != triplet.first.second && + triplet.first.first != triplet.first.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ), value " << triplet.second << ".\n"; + rc = FAILED; + } + } else { + if( triplet.first.first != triplet.first.second ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ), value " << triplet.second << ".\n"; + rc = FAILED; + } + } + if( triplet.first.first != static_cast< size_t >(triplet.second) ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ) with value " << triplet.second; + std::cerr << ", expected value "<< triplet.first.first <<".\n"; + rc = FAILED; + } + } + if( rc != SUCCESS ) { return rc; } + + // done + return rc; +} + +/** Specialisation for void output */ +template< + Descriptor descr = descriptors::no_operation, + typename Tmask, typename Tin +> +RC masked_tests_generic_impl( + RC &rc, grb::Matrix< void > &output, + const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input, + const size_t n +) { + const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0; + + std::cout << "\t\t with structural descriptor\n"; + rc = grb::set< descr | descriptors::structural >( output, mask, input ); + if( rc != SUCCESS ) { + std::cerr << "\t grb::set structural (matrix to matrix masked) FAILED\n"; + return rc; + } + if( grb::nnz( output ) != 2 * n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n"; + rc = FAILED; + } + for( const auto &triplet : output ) { + if( + triplet.first != triplet.second && + triplet.first != triplet.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } + if( rc != SUCCESS ) { return rc; } + + std::cout << "\t\t without descriptor\n"; + rc = grb::set< descr >( output, mask, input ); + if( rc != SUCCESS ) { + std::cerr << "\t grb::set (matrix to matrix masked) FAILED\n"; + return rc; + } + if( emptyMask ) { + if( grb::nnz( output ) != 2 * n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n"; + rc = FAILED; + } + } else { + if( grb::nnz( output ) != n ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << n <<".\n"; + rc = FAILED; + } + } + for( const auto &triplet : output ) { + if( emptyMask ) { + if( triplet.first != triplet.second && + triplet.first != triplet.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } else { + if( triplet.first != triplet.second ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } + } + if( rc != SUCCESS ) { return rc; } + + // done + return rc; +} + +/** Implementation of masked tests for non-void masks (nvm). 
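+ *
+ * Only non-void (valued) masks are exercised here: inverting a void mask would
+ * amount to structural mask inversion, which the masked set primitive forbids.
+ * The void-mask dispatch below therefore skips the invert_mask tests.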
*/ +template< + Descriptor descr = descriptors::no_operation, + typename Tout, typename Tmask, typename Tin +> +RC masked_tests_nvm_impl( + RC &rc, grb::Matrix< Tout > &output, + const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input, + const size_t n +) { + const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0; + + std::cout << "\t\t with invert_mask descriptor\n"; + rc = grb::set< descr | descriptors::invert_mask >( output, mask, input ); + if( rc != SUCCESS ) { + std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n"; + return rc; + } + if( emptyMask ) { + if( grb::nnz( output ) != 2 * n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n"; + rc = FAILED; + } + } else { + if( grb::nnz( output ) != n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << n - 1 <<".\n"; + rc = FAILED; + } + } + for( const auto &triplet : output ) { + if( emptyMask ) { + if( triplet.first.first != triplet.first.second && + triplet.first.first != triplet.first.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ), value " << triplet.second << ".\n"; + rc = FAILED; + } + } else { + if( triplet.first.first != triplet.first.second - 1 ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ), value " << triplet.second << ".\n"; + rc = FAILED; + } + } + if( triplet.first.first != static_cast< size_t >(triplet.second) ) { + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " + << triplet.first.second << " ) with value " << triplet.second; + std::cerr << ", expected value "<< triplet.first.first <<".\n"; + rc = FAILED; + } + } + if( rc != SUCCESS ) { return rc; } + + // done + return rc; +} + +/** Specialisation for void output */ +template< + Descriptor descr = descriptors::no_operation, + typename Tmask, typename Tin +> +RC masked_tests_nvm_impl( + RC &rc, grb::Matrix< void > &output, + const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input, + const size_t n +) { + const bool emptyMask = grb::nrows( mask ) == 0 || grb::ncols( mask ) == 0; + + std::cout << "\t\t with invert_mask descriptor\n"; + rc = grb::set< descr | descriptors::invert_mask >( output, mask, input ); + if( rc != SUCCESS ) { + std::cerr << "\t grb::set invert mask (matrix to matrix masked) FAILED\n"; + return rc; + } + if( emptyMask ) { + if( grb::nnz( output ) != 2 * n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << 2 * n - 1 <<".\n"; + rc = FAILED; + } + } else { + if( grb::nnz( output ) != n - 1 ) { + std::cerr << "\t unexpected number of output elements ( " + << grb::nnz( output ) << " ), expected " << n - 1 <<".\n"; + rc = FAILED; + } + } + for( const auto &triplet : output ) { + if( emptyMask ) { + if( triplet.first != triplet.second && + triplet.first != triplet.second - 1 + ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } else { + if( triplet.first != triplet.second - 1 ) { + std::cerr << "\t unexpected entry at ( " << triplet.first << ", " + << triplet.second << " ), no value (pattern matrix).\n"; + rc = FAILED; + } + } + } + if( rc != SUCCESS ) { return rc; } + + // done + return rc; +} + +/** Dispatch for generic masked tests */ 
+template< typename Tout, typename Tmask, typename Tin >
+RC masked_tests(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< Tmask > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	if( masked_tests_generic_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	return masked_tests_nvm_impl( rc, output, mask, input, n );
+}
+
+/** Specialised dispatch for masked tests with void masks */
+template< typename Tout, typename Tin >
+RC masked_tests(
+	RC &rc, grb::Matrix< Tout > &output,
+	const grb::Matrix< void > &mask, const grb::Matrix< Tin > &input,
+	const size_t n
+) {
+	const grb::RC ret = masked_tests_generic_impl( rc, output, mask, input, n );
+	std::cout << "\t\t invert_mask descriptor SKIPPED\n";
+	return ret;
+}
+
+/** Specialised dispatch for masked tests with no-cast domains */
+template< typename T >
+RC masked_tests(
+	RC &rc, grb::Matrix< T > &output,
+	const grb::Matrix< bool > &mask, const grb::Matrix< T > &input,
+	const size_t n
+) {
+	if( masked_tests_generic_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	if( masked_tests_nvm_impl( rc, output, mask, input, n ) != SUCCESS ) {
+		return rc;
+	}
+	std::cout << "\t re-running previous tests with no_casting descriptor\n";
+	if(
+		masked_tests_generic_impl< descriptors::no_casting >(
+			rc, output, mask, input, n
+		) != SUCCESS
+	) {
+		return rc;
+	}
+	return masked_tests_nvm_impl< descriptors::no_casting >(
+		rc, output, mask, input, n );
+}
+
 void grb_program( const size_t &n, grb::RC &rc ) {
-	// initialize test
+	// initialise non-masked test containers
+	grb::Matrix< double > A( n, n );
+	grb::Matrix< double > B( n, n );
+	grb::Matrix< void > C( n, n );
+	grb::Matrix< void > D( n, n );
+	grb::Matrix< unsigned int > E( n, n );
+
+	// initialise masked test containers
+	grb::Matrix< int > mask( n, n );
+	grb::Matrix< bool > maskBool( n, n );
+	grb::Matrix< void > maskVoid( n, n );
+	grb::Matrix< double > maskEmpty( 0, 0 );
+	grb::Matrix< int > input( n, n );
+	grb::Matrix< void > inputVoid( n, n );
+	grb::Matrix< float > inputFloat( n, n );
+	grb::Matrix< int > output( n, n );
+	grb::Matrix< void > outputVoid( n, n );
+
+	// initialise non-masked test data
 	int chk[ 10 ][ 10 ];
 	for( size_t i = 0; i < 10; ++i ) {
 		for( size_t j = 0; j < 10; ++j ) {
@@ -38,21 +387,17 @@ void grb_program( const size_t &n, grb::RC &rc ) {
 	for( size_t k = 0; k < 15; ++k ) {
 		chk[ I[ k ] ][ J[ k ] ] = data1[ k ];
 	}
-	grb::Matrix< double > A( n, n );
-	grb::Matrix< double > B( n, n );
-	grb::Matrix< void > C( n, n );
-	grb::Matrix< void > D( n, n );
-	grb::Matrix< unsigned int > E( n, n );
+
 	rc = grb::resize( A, 15 );
 	if( rc == SUCCESS ) {
 		rc = grb::buildMatrixUnique( A, I, J, data1, 15, SEQUENTIAL );
 		for( const auto &triplet : A ) {
 			if( triplet.first.first >= 10 || triplet.first.second >= 10 ) {
-				std::cerr << "\tunexpected entry at A( " << triplet.first.first << ", "
+				std::cerr << "\t unexpected entry at A( " << triplet.first.first << ", "
 					<< triplet.first.second << " ).\n";
 				rc = FAILED;
 			} else if( chk[ triplet.first.first ][ triplet.first.second ] != triplet.second ) {
-				std::cerr << "\tunexpected entry at A( " << triplet.first.first << ", "
+				std::cerr << "\t unexpected entry at A( " << triplet.first.first << ", "
 					<< triplet.first.second << " ) with value " << triplet.second;
 				if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) {
 					std::cerr << ", expected no entry here.\n";
@@ -64,22 +409,86 @@
 			}
 		}
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( B, 15 );
+	rc = rc ? rc : grb::resize( B, 15 );
+	rc = rc ? rc : grb::resize( C, 15 );
+	rc = rc ? rc : grb::resize( D, 15 );
+	rc = rc ? rc : grb::resize( E, 15 );
+	rc = rc ? rc : grb::resize( output, 15 );
+	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
+		std::cerr << "\tinitialisation FAILED\n";
+		return;
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( C, 15 );
+
+	// initialise data for masked-set tests
+	//  - mask will be an n by n matrix with its diagonal set to 1 and its
+	//    superdiagonal set to 0.
+	//  - input will be an n by n matrix with each element on its diagonal and
+	//    superdiagonal set to its row index (meaning the entries on row 0 have
+	//    value 0).
+	size_t * const I_mask = new (std::nothrow) size_t[ 2 * n - 1 ];
+	size_t * const J_mask = new (std::nothrow) size_t[ 2 * n - 1 ];
+	int * const mask_vals = new (std::nothrow) int[ 2 * n - 1 ];
+	int * const input_vals = new (std::nothrow) int[ 2 * n - 1 ];
+	if( I_mask == nullptr || J_mask == nullptr || mask_vals == nullptr ||
+		input_vals == nullptr
+	) {
+		std::cerr << "\t initialisation FAILED\n";
+		return;
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( D, 15 );
+	for( size_t k = 0; k < n; ++k ) {
+		I_mask[ k ] = J_mask[ k ] = k;
+		mask_vals[ k ] = 1;
+		input_vals[ k ] = static_cast< int >( k );
+		if( k < n - 1 ) {
+			I_mask[ n + k ] = k;
+			J_mask[ n + k ] = k + 1;
+			mask_vals[ n + k ] = 0;
+			input_vals[ n + k ] = static_cast< int >( k );
+		}
 	}
-	if( rc == SUCCESS ) {
-		rc = grb::resize( E, 15 );
+	rc = grb::buildMatrixUnique( mask, I_mask, J_mask, mask_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of mask matrix FAILED\n";
+		return;
 	}
-	if( rc != SUCCESS || grb::nnz( A ) != 15 ) {
-		std::cerr << "\tinitialisation FAILED\n";
+	rc = grb::buildMatrixUnique( maskBool, I_mask, J_mask, mask_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of maskBool matrix FAILED\n";
+		return;
+	}
+	try {
+		maskVoid = grb::algorithms::matrices< void >::identity( n );
+	} catch( ... ) {
+		std::cerr << "\t constructing maskVoid FAILED\n";
+		return;
+	}
+	rc = grb::buildMatrixUnique( input, I_mask, J_mask, input_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of input matrix FAILED\n";
+		return;
+	}
+	rc = grb::buildMatrixUnique( inputFloat, I_mask, J_mask, input_vals, 2 * n - 1,
+		SEQUENTIAL );
+	if( rc != SUCCESS ) {
+		std::cerr << "\t buildMatrixUnique of inputFloat matrix FAILED\n";
+		return;
+	}
+	rc = grb::resize( inputVoid, 2 * n - 1 );
+	rc = rc ? 
rc : grb::resize( outputVoid, 2 * n - 1 ); + if( rc != SUCCESS ) { + std::cerr << "\t error resizing matrices for masked tests\n"; return; } + // postpone materialisation of inputVoid since it relies on unmasked grb::set + // (which is itself unit-tested later) + + delete [] I_mask; + delete [] J_mask; + delete [] mask_vals; + delete [] input_vals; std::cout << "\t test initialisation complete\n"; // check grb::set for non-voids @@ -96,11 +505,11 @@ void grb_program( const size_t &n, grb::RC &rc ) { } for( const auto &triplet : B ) { if( triplet.first.first >= 10 || triplet.first.second >= 10 ) { - std::cerr << "\tunexpected entry at B( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at B( " << triplet.first.first << ", " << triplet.first.second << " ).\n"; rc = FAILED; } else if( chk[ triplet.first.first ][ triplet.first.second ] != triplet.second ) { - std::cerr << "\tunexpected entry at B( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at B( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second; if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) { std::cerr << ", expected no entry here.\n"; @@ -175,14 +584,14 @@ void grb_program( const size_t &n, grb::RC &rc ) { } for( const auto &triplet : E ) { if( triplet.first.first >= 10 || triplet.first.second >= 10 ) { - std::cerr << "\tunexpected entry at E( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at E( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n"; rc = FAILED; } else if( static_cast< unsigned int >( chk[ triplet.first.first ][ triplet.first.second ] ) != triplet.second ) { - std::cerr << "\tunexpected entry at E( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at E( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second; if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) { std::cerr << ", expected no entry here.\n"; @@ -209,11 +618,11 @@ void grb_program( const size_t &n, grb::RC &rc ) { } for( const auto &triplet : E ) { if( triplet.first.first >= 10 || triplet.first.second >= 10 ) { - std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ), value " << triplet.second << ".\n"; rc = FAILED; } else if( 117 != triplet.second ) { - std::cerr << "\tunexpected entry at ( " << triplet.first.first << ", " + std::cerr << "\t unexpected entry at ( " << triplet.first.first << ", " << triplet.first.second << " ) with value " << triplet.second; if( chk[ triplet.first.first ][ triplet.first.second ] == 0 ) { std::cerr << ", expected no entry here.\n"; @@ -224,6 +633,109 @@ void grb_program( const size_t &n, grb::RC &rc ) { } } if( rc != SUCCESS ) { return; } + + // check masked matrix set + // first, finish initialisation + rc = grb::set( inputVoid, input ); + rc = rc ? 
rc : grb::resize( output, 2 * n - 1 ); + if( rc != SUCCESS || grb::nnz( inputVoid ) != 2 * n - 1 ) { + std::cerr << "\t error in inputVoid (an earlier test likely failed)\n"; + if( rc == SUCCESS ) { rc = FAILED; } + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, " + << "empty mask\n"; + if( masked_tests( rc, output, maskEmpty, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, " + << "non-empty mask\n"; + if( masked_tests( rc, output, mask, input, n ) != grb::SUCCESS ) { return; } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, no-cast, " + << "non-empty Boolean mask\n"; + if( masked_tests( rc, output, maskBool, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from " + << "float to int, empty mask\n"; + if( masked_tests( rc, output, maskEmpty, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from " + << "float to int, non-empty mask\n"; + if( masked_tests( rc, output, mask, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), non-void, casting from " + << "float to int, non-empty Boolean mask\n"; + if( masked_tests( rc, output, maskBool, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), " + << "empty mask\n"; + if( masked_tests( rc, outputVoid, maskEmpty, inputVoid, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), " + << "non-empty mask\n"; + if( masked_tests( rc, outputVoid, mask, inputVoid, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), void-to-void (no cast), " + << "non-empty Boolean mask\n"; + if( masked_tests( rc, outputVoid, maskBool, inputVoid, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting " + << "(sort of), empty mask\n"; + if( masked_tests( rc, outputVoid, maskEmpty, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting " + << "(sort of), non-empty mask\n"; + if( masked_tests( rc, outputVoid, mask, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), int-to-void, casting " + << "(sort of), Boolean mask\n"; + if( masked_tests( rc, outputVoid, maskBool, input, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting " + << "(sort of), empty mask\n"; + if( + masked_tests( rc, outputVoid, maskEmpty, inputFloat, n ) != + grb::SUCCESS + ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting " + << "(sort of), non-empty mask\n"; + if( masked_tests( rc, outputVoid, mask, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + std::cout << "\t testing set( matrix, mask, matrix ), float-to-void, casting " + << "(sort of), Boolean mask\n"; + if( masked_tests( rc, outputVoid, maskBool, inputFloat, n ) != grb::SUCCESS ) { + return; + } + + // done } int main( int argc, char ** argv ) { diff --git a/tests/unit/unittests.sh b/tests/unit/unittests.sh index 0b49d2421..6abb78e21 100755 --- a/tests/unit/unittests.sh +++ b/tests/unit/unittests.sh @@ -484,6 +484,12 @@ for MODE in 
${MODES}; do grep "Test OK" ${TEST_OUT_DIR}/matrixSet_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" echo " " + echo ">>> [x] [ ] Testing grb::set (matrices), size 10 000" + $runner ${TEST_BIN_DIR}/matrixSet_${MODE}_${BACKEND} 10000 2> ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.err 1> ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log + head -1 ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log + grep "Test OK" ${TEST_OUT_DIR}/matrixSet_large_${MODE}_${BACKEND}_${P}_${T}.log || echo "Test FAILED" + echo " " + echo ">>> [x] [ ] Testing grb::set (matrix, value)" $runner ${TEST_BIN_DIR}/setMatrixValue_${MODE}_${BACKEND} 2> ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.err 1> ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.log head -1 ${TEST_OUT_DIR}/setMatrixValue_${MODE}_${BACKEND}_${P}_${T}.log