diff --git a/CMakeLists.txt b/CMakeLists.txt index 02c49eb37..742af7511 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ project( GraphBLAS DESCRIPTION "The ultimate engine for sparse computation" LANGUAGES CXX C ) -set( CMAKE_CXX_STANDARD 11 ) +set( CMAKE_CXX_STANDARD 14 ) set( CMAKE_CXX_STANDARD_REQUIRED ON ) # install within the build directory by default (NOT to /usr/local or the likes) @@ -52,13 +52,14 @@ endif() option( WITH_REFERENCE_BACKEND "With Reference backend" ON ) option( WITH_OMP_BACKEND "With OMP backend" ON ) option( WITH_HYPERDAGS_BACKEND "With Hyperdags backend" ON ) +option( WITH_ASCEND_BACKEND "With Ascend backend" ON ) if( WITH_HYPERDAGS_BACKEND ) if( NOT DEFINED WITH_HYPERDAGS_USING ) set( WITH_HYPERDAGS_USING "reference" ) endif() endif() option( WITH_NONBLOCKING_BACKEND "With Nonblocking backend" ON ) -option( WITH_NUMA "With NUMA support" ON ) +option( WITH_NUMA "With NUMA support" OFF ) option( LPF_INSTALL_PATH "Path to the LPF tools for the BSP1D and Hybrid backends" OFF ) # the following options depend on LPF_INSTALL_PATH being set include(CMakeDependentOption) @@ -192,6 +193,12 @@ if( WITH_HYBRID_BACKEND ) endif() endif() +# Enable nonblocking backend if ascend is active +if( WITH_ASCEND_BACKEND ) + set( WITH_NONBLOCKING_BACKEND ON ) + message( STATUS "Enabling compilation of nonblocking backend: required by the Ascend backend" ) +endif() + # Enabling reference_omp backend if non-blocking is active if( WITH_NONBLOCKING_BACKEND ) if( NOT WITH_OMP_BACKEND ) diff --git a/bootstrap.sh b/bootstrap.sh index e24d75d45..3717a65a0 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -80,6 +80,7 @@ the location where LPF is installed" echo " optional; default value is reference" echo " clashes with --no-hyperdags" echo " --no-nonblocking - disables the nonblocking backend" + echo " --no-ascend - disables the ascend backend" echo " --[debug | coverage]-build - build the project with debug | coverage options (tests will run much slower!)" echo " --generator= - set the generator for CMake (otherwise use CMake's default)" echo " --show - show generation commands instead of running them" @@ -102,6 +103,7 @@ reference=yes hyperdags=yes hyperdags_using=reference nonblocking=yes +ascend=yes banshee=no lpf=no show=no @@ -176,6 +178,9 @@ or assume default paths (--with-lpf)" --no-nonblocking) nonblocking=no ;; + --no-ascend) + ascend=no + ;; --debug-build) debug_build=yes ;; @@ -286,7 +291,7 @@ CURRENT_DIR="$(pwd)" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" # CONFIGURE CMAKE BUILDING INFRASTRUCTURE -if [[ "${reference}" == "yes" || "${lpf}" == "yes" || "${nonblocking}" == "yes" ]]; then +if [[ "${reference}" == "yes" || "${lpf}" == "yes" || "${nonblocking}" == "yes" || "${ascend}" == "yes" ]]; then BUILD_DIR="${CURRENT_DIR}" printf "Checking for cmake..." 
@@ -363,6 +368,9 @@ the current directory before invocation or confirm the deletion of its content w if [[ "${nonblocking}" == "no" ]]; then CMAKE_OPTS+=" -DWITH_NONBLOCKING_BACKEND=OFF" fi + if [[ "${ascend}" == "no" ]]; then + CMAKE_OPTS+=" -DWITH_ASCEND_BACKEND=OFF" + fi if [[ "${lpf}" == "yes" ]]; then CMAKE_OPTS+=" -DLPF_INSTALL_PATH='${ABSOLUTE_LPF_INSTALL_PATH}'" fi diff --git a/cmake/AddGRBInstall.cmake b/cmake/AddGRBInstall.cmake index 94bd58f31..49ae44ab0 100644 --- a/cmake/AddGRBInstall.cmake +++ b/cmake/AddGRBInstall.cmake @@ -47,7 +47,7 @@ set( SHMEM_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/sequential" ) set( HYPERDAGS_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hyperdags" ) set( BSP1D_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/spmd" ) set( HYBRID_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/hybrid" ) - +set( ASCEND_BACKEND_INSTALL_DIR "${BINARY_LIBRARIES_INSTALL_DIR}/ascend" ) # addBackendWrapperGenOptions @@ -146,6 +146,14 @@ if( WITH_NONBLOCKING_BACKEND ) ) endif() +if( WITH_ASCEND_BACKEND ) + addBackendWrapperGenOptions( "ascend" + COMPILE_DEFINITIONS "${ASCEND_SELECTION_DEFS};${ASCEND_INCLUDE_DEFS}" + LINK_FLAGS "'${SHMEM_BACKEND_INSTALL_DIR}/lib${BACKEND_LIBRARY_OUTPUT_NAME}.a'" + "'${ALP_UTILS_INSTALL_DIR}/lib${ALP_UTILS_LIBRARY_OUTPUT_NAME}.a'" "${NUMA_LFLAG}" + ) +endif() + # distributed memory backends if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND ) assert_valid_variables( LPFRUN LPFCPP ) diff --git a/cmake/AddGRBVars.cmake b/cmake/AddGRBVars.cmake index fab0f9ac9..a5235a519 100644 --- a/cmake/AddGRBVars.cmake +++ b/cmake/AddGRBVars.cmake @@ -33,6 +33,7 @@ set( BSP1D_BACKEND_DEFAULT_NAME "backend_bsp1d" ) set( HYBRID_BACKEND_DEFAULT_NAME "backend_hybrid" ) set( HYPERDAGS_BACKEND_DEFAULT_NAME "backend_hyperdags" ) set( NONBLOCKING_BACKEND_DEFAULT_NAME "backend_nonblocking" ) +set( ASCEND_BACKEND_DEFAULT_NAME "backend_ascend" ) ### COMPILER DEFINITIONS FOR HEADERS INCLUSION AND FOR BACKEND SELECTION @@ -41,6 +42,7 @@ set( REFERENCE_INCLUDE_DEFS "_GRB_WITH_REFERENCE" ) set( REFERENCE_OMP_INCLUDE_DEFS "_GRB_WITH_OMP" ) set( HYPERDAGS_INCLUDE_DEFS "_GRB_WITH_HYPERDAGS" ) set( NONBLOCKING_INCLUDE_DEFS "_GRB_WITH_NONBLOCKING" ) +set( ASCEND_INCLUDE_DEFS "_GRB_WITH_ASCEND" ) set( LPF_INCLUDE_DEFS "_GRB_WITH_LPF" ) # compiler definitions to select a backend @@ -51,6 +53,7 @@ set( HYPERDAGS_SELECTION_DEFS "_GRB_WITH_HYPERDAGS_USING=${WITH_HYPERDAGS_USING}" ) set( NONBLOCKING_SELECTION_DEFS "_GRB_BACKEND=nonblocking" ) +set( ASCEND_SELECTION_DEFS "_GRB_BACKEND=ascend" ) set( BSP1D_SELECTION_DEFS "_GRB_BACKEND=BSP1D" "_GRB_BSP1D_BACKEND=reference" @@ -64,7 +67,7 @@ set( HYBRID_SELECTION_DEFS set( NO_NUMA_DEF "_GRB_NO_LIBNUMA" ) ### **ALL** BACKENDS, EVEN IF NOT ENABLED BY USER -set( ALL_BACKENDS "reference" "reference_omp" "hyperdags" "nonblocking" "bsp1d" "hybrid" ) +set( ALL_BACKENDS "reference" "reference_omp" "hyperdags" "nonblocking" "ascend" "bsp1d" "hybrid" ) # list of user-enabled backends, for tests and wrapper scripts (do not change!) 
set( AVAILABLE_BACKENDS "" ) @@ -90,6 +93,10 @@ if( WITH_NONBLOCKING_BACKEND ) list( APPEND AVAILABLE_BACKENDS "nonblocking" ) endif() +if( WITH_ASCEND_BACKEND ) + list( APPEND AVAILABLE_BACKENDS "ascend" ) +endif() + # distributed memory backends if( WITH_BSP1D_BACKEND ) list( APPEND AVAILABLE_BACKENDS "bsp1d" ) diff --git a/docs/Build_and_test_infra.md b/docs/Build_and_test_infra.md index 8e28e47cb..d0d4fd3c0 100644 --- a/docs/Build_and_test_infra.md +++ b/docs/Build_and_test_infra.md @@ -726,7 +726,7 @@ build path, with set_target_properties( backend_example_static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/example_output_dir" ) ``` -1. add the new library to the `libs` target, which allows users to compile all +7. add the new library to the `libs` target, which allows users to compile all backend libraries at once ```cmake diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9a6affa1a..87f2d48b1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -14,7 +14,7 @@ # limitations under the License. # -assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND ) +assert_defined_variables( WITH_REFERENCE_BACKEND WITH_OMP_BACKEND WITH_ASCEND_BACKEND ) # target listing all examples, to build them at once with 'make examples' add_custom_target( examples) @@ -31,3 +31,57 @@ if( WITH_OMP_BACKEND ) add_dependencies( examples sp_reference_omp ) endif() +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_addOp_ascend unittests/alp_ascend_addOp.cpp ) + target_link_libraries( alp_ascend_addOp_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_addOp_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_addOpv1_ascend unittests/alp_ascend_addOpv1.cpp ) + target_link_libraries( alp_ascend_addOpv1_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_addOpv1_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_movedataOpv01_ascend unittests/alp_ascend_movedataOpv01.cpp ) + target_link_libraries( alp_ascend_movedataOpv01_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_movedataOpv01_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_softmaxOp_ascend unittests/alp_ascend_softmaxOp.cpp ) + target_link_libraries( alp_ascend_softmaxOp_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_softmaxOp_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_softmaxOpv1_ascend unittests/alp_ascend_softmaxOpv1.cpp ) + target_link_libraries( alp_ascend_softmaxOpv1_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_softmaxOpv1_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_softmaxOpv3_ascend unittests/alp_ascend_softmaxOpv3.cpp ) + target_link_libraries( alp_ascend_softmaxOpv3_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_softmaxOpv3_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_softmaxOpv4_ascend unittests/alp_ascend_softmaxOpv4.cpp ) + target_link_libraries( alp_ascend_softmaxOpv4_ascend backend_ascend common_flags ) + add_dependencies( examples alp_ascend_softmaxOpv4_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( alp_ascend_onlinesoftmaxOp_ascend unittests/alp_ascend_onlinesoftmaxOp.cpp ) + target_link_libraries( alp_ascend_onlinesoftmaxOp_ascend backend_ascend common_flags ) + add_dependencies( examples 
alp_ascend_onlinesoftmaxOp_ascend ) +endif() + +if( WITH_ASCEND_BACKEND ) + add_executable( ascend_flashattentionOp_ascend ascend_flashattentionOp.cpp ) + target_link_libraries( ascend_flashattentionOp_ascend backend_ascend common_flags ) + #add_dependencies( examples ascend_flashattentionOp_ascend ) +endif() + diff --git a/examples/Makefile b/examples/Makefile new file mode 100644 index 000000000..ee257c7d7 --- /dev/null +++ b/examples/Makefile @@ -0,0 +1,11 @@ + +# Makefile for the Ascend examples + +.PHONY: all + +all: + ascendcc -b 910B -c -o op.o op.cpp + ascendcc -I/home/yzelman/Packages/CANN/samples/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/kernel_invocation/kernel_template/ -b 910B -c -o op_host.o ../examples/ascend_host.cpp + ascendcc -o main.exe op.o op_host.o + LD_LIBRARY_PATH=/home/yzelman/Packages/CANN/x86_64/ascend-toolkit/latest/x86_64-linux/lib64/:/home/yzelman/Packages/CANN/x86_64/ascend-toolkit/latest/x86_64-linux/devlib/x86_64/ ./main.exe + diff --git a/examples/alp_ascend_softMaxOp-manuallyTiled.cpp b/examples/alp_ascend_softMaxOp-manuallyTiled.cpp new file mode 100644 index 000000000..68fc66468 --- /dev/null +++ b/examples/alp_ascend_softMaxOp-manuallyTiled.cpp @@ -0,0 +1,108 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 4 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
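+//
+// Reading aid (added commentary, not generated code): per fiber along axis 3,
+// the forEach body below computes the numerically stable softmax
+//
+//   out[k] = exp( in[k] - max_j in[j] ) / sum_j exp( in[j] - max_j in[j] )
+//
+// which corresponds one-to-one to the max / minus / exp / add / divide calls.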
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { + rc = alp::RC::FAILED; + + Tensor Sin( alp::Datatype::FP16, make_axes( 0, 1, 2, 3 ) ); + Tensor Sout( alp::Datatype::FP16, make_axes( 0, 1, 2, 3 ) ); + + rc = grid.forEach( make_axes( 0 ), [ & ] () { + + auto S_block_in_ub = getView( Sin ); // T(1,2,3) + auto S_block_out_ub = getView( Sout ); // T(1,2,3) + Tensor localTensor_ub( alp::Datatype::FP16, make_axes( 1, 2 ) ); // T(1,2) + + rc = grid.forEach( make_axes( 1 ), [ & ] () { + + auto S_block_in = getView( S_block_in_ub ); // T(2,3) + auto S_block_out = getView( S_block_out_ub ); // T(2,3) + auto localTensor = getView( localTensor_ub ); // T(2) + + // T(2) T(2,3) + apply( localTensor, S_block_in, "max", make_axes( 3 ) ); + // T(2,3) T(2,3) T(2) + apply( S_block_out, S_block_in, localTensor, "minus", make_axes( 3 ) ); + // T(2,3) + foldl( S_block_out, "exp" ); + // T(2) T(2,3) + apply( localTensor, S_block_out, "add", make_axes( 3 ) ); + // T(2,3) T(2) + foldl( S_block_out, localTensor, "divide", make_axes( 3 ) ); + // T(2,3) + + } ); + + store( S_block_out ); + + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 4 >( ascend_code, "KernelSoftmax" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/ascend_flashattentionOp-v2.cpp b/examples/ascend_flashattentionOp-v2.cpp new file mode 100644 index 000000000..eec996727 --- /dev/null +++ b/examples/ascend_flashattentionOp-v2.cpp @@ -0,0 +1,178 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +using namespace alp; + + +// alp::Grid< 1, 3 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +void ascend_code( const Grid< 1, 3 > &grid, RC &rc ) { // max shape = ( m, Tr, N ) + rc = alp::RC::FAILED; + + // input // Q and O are 'canonically' aligned. 
+ Tensor Q( grid, type::FP16, axes( 0, 1, 2 ) ); // shape = (m, Tr, N) + Tensor K( grid, type::FP16, axes( 2, 0, 1 ) ); // shape = (N, m, Tr) // transposed shape compared to Q + Tensor V( grid, type::FP16, axes( 2, 0, 1 ) ); // shape = (N, m, Tr) // transposed shape compared to Q + + // temp + Tensor m( grid, type::FP16, axes( 0, 1 ) ); // shape = (m, Tr) = (m, Tr , 1) = ( m, Tr, 1, 1, .. ) + // scalar shape = (1, 1, 1) + // output + Tensor l( grid, type::FP16, axes( 0, 1 ) ); // shape = (m, Tr) + Tensor O( grid, type::FP16, axes( 0, 1, 2 ) ); // shape = (m, Tr, N) + + set( O, 0 ); + set( l, values::zero ); // values::zero is equivalent to 0 + set( m, values::minus_infinity ); + + // forEach cuts the grid into small pieces that are processed concurrently + rc = grid.forEach( [ &grid, &Q, &K, &V, &l, &m ] () { + // a view gets the local part to be processed + // e.g. axes( O_block ) = alp::axes( threadID(), 1, 2 ) + auto O_block = O.getView( grid ); + + auto Q_block = Q.getView( grid ); + + // if tensors are permuted, the "cut" dimension still refers to that defined + // by the grid. E.g. axes( K_block ) = alp::axes( 2, threadID(), 1 ) + auto K_block = K.getView( grid ); + auto V_block = V.getView( grid ); + auto l_block = l.getView( grid ); + auto m_block = m.getView( grid ); + + // tensor version of Stmp = mxm( Q_block, K_block ) + // - tensor contraction along one axis + // - 2 is the contraction axis + Tensor Stmp( grid, type::FP16, axes( 0, 1, 1 ) ); // AJ2D: I think this should have been 1, 1? Or the below mxm ex. is wrong? + Stmp = Q_block( "i", "m", "k" ) * K_block( "k", "j", "m" ); // AJ2D: is this correct in Einstein notation? + // It seems to me to match the below code + // (although I don't get foldl with a semiring) + + // tensor contraction in one axis: + // alp::semiring multiplication and accumualtion operators + // e.g. Stmp[ : , : ] = mxm( Q_block[ threadID(), :, : ], K_block[ :, threadID(), : ] ) + // set( Stmp, values::zero ); + // alp::foldl( Stmp, Q_block, K_block, alp::semiring(), alp::axes( 2 ) ); + // NOTE: in general multiple axes needed with proper reduction rules: + // here, Dim(Stmp) + 2*Dim(axes) = Dim(Q_block) + Dim(Q_block) + + + Tensor tmp( grid, type::FP16, axes( 1 ) ); + set( tmp, m_block ); // AJ2D: here tmp is one-dimensional but m_block is two-dimensional? + // I think this means the parallelised dimension has only one fiber, + // not a block of fibers, perhaps? That could work (though the codegen + // would have to coalesce them back). I had assumed we got back a block + // of some size close to n/p. If we have a block then the following + // seems correct and perhaps more clear? + // Tensor tmp( grid, type::FP16, axes( 0, 1 ) ) + // set( tmp, m_block ) + + // two was the "contraction" axis, e.g. row-wise reduction + max( m_block, Stmp ); // AJ2D: I think here the axes become confusing. If the axes of Stmp are correct + // (I modified it), then the "axes(2)" which used to be here do not match + // any axes in m_bock and Stmp. Translating it into matrix land, Stmp is + // n x n while m_block is m x n. If m = 1 (see above comment block), then + // indeed what max is a reduction, but it remains ambiguous over what + // dimension the reduction should go (rows or columns -- both are the same + // mode). If m > 1, then the semantics I suppose are to broadcast the + // result of max( Stmp ) into m_block? + // + // Would the following perhaps be clearer? 
+ // tmp = max( Stmp( "i", "j" ), "j" ); + // m_block( "i", "j" ) = tmp( "j" ); // broadcast tmp to m_block + + // AJ2D: in the below, I will just assume Einstein notation while simplifying the code + + // 'row-wise' Stmp -= m_block + Stmp( "i", "j" ) -= m_block( "j" ); + + // if no axes are specified apply along all axes + // This is equivalent to reduction with scalar, just inplace + // Stmp = exp(Stmp) + Stmp = exp( Stmp ); + + // tmp=exp(tmp-m_block) + tmp = exp( tmp - m_block ); + + // l_block += rowsum(Stmp) + l_block += sum( Stmp( "i", "j" ), "j" ); + + // 'row-wise' O_block *= tmp + O_block *= tmp; + + // tensor version of O_block = mxm( Stmp, V_block ), i.e., contraction + Oblock( "i", "j", "k" ) += Stmp( "i", "r" ) * V_block( "k", "r", "j" ); + + // 'row-wise' O_block *= 1/l_block + O_block /= l_block; + // or div( O_block, l_block ); + + // l_block = log(m_block) + m_block + l_block = log( m_block ) + m_block; + + // store output + alp::store( O_block ); + alp::store( l_block ); + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 1 >( ascend_code ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/ascend_flashattentionOp-v3.cpp b/examples/ascend_flashattentionOp-v3.cpp new file mode 100644 index 000000000..2db2e0cbd --- /dev/null +++ b/examples/ascend_flashattentionOp-v3.cpp @@ -0,0 +1,146 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +using namespace alp; + + +// alp::Grid< 1, 5 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +void ascend_code( const Grid< 1, 5 > &grid, RC &rc ) { // max shape = ( Tr, Br, Tc, Bc, d ) + rc = alp::RC::FAILED; + + // input // Q and O are 'canonically' aligned. + Tensor Q( grid, type::FP16, axes( 0, 1, 4 ) ); // shape = (Tr, Br, d) + Tensor K( grid, type::FP16, axes( 4, 2, 3 ) ); // shape = (d, Tc, Bc) // transposed shape compared to Q + Tensor V( grid, type::FP16, axes( 4, 2, 3 ) ); // shape = (d, Tc, Bc) // transposed shape compared to Q + + // temp + Tensor m( grid, type::FP16, axes( 0, 1 ) ); // shape = (Tr, Br) = (Tr, Br , 1) = ( Tr, Br, 1, 1, .. 
) + // scalar shape = (1, 1, 1) + // output + Tensor l( grid, type::FP16, axes( 0, 1 ) ); // shape = (Tr, Br) + Tensor O( grid, type::FP16, axes( 0, 1, 2 ) ); // shape = (Tr, Br, d) + + set( O, 0 ); + set( l, values::zero ); // values::zero is equivalent to 0 + set( m, values::minus_infinity ); + + // forEach cuts the grid into small pieces that are processed concurrently + rc = grid.forEach( [ &grid, &Q, &K, &V, &l, &m ] () { + // a view gets the local part to be processed + // e.g. axes( O_block ) = alp::axes( threadID(), 1, 4 ) + auto O_block = O.getView( grid ); + auto Q_block = Q.getView( grid ); + auto K_block = K.getView( grid ); + auto V_block = V.getView( grid ); + auto l_block = l.getView( grid ); + auto m_block = m.getView( grid ); + + // tensor version of Stmp = mxm( Q_block, K_block ) + // - tensor contraction along one axis + // - 2 is the contraction axis + Tensor Stmp( grid, type::FP16, axes( 0, 2, 3 ) ); + Stmp = Q_block( "i", "j", "k" ) * K_block( "l", "m", "k" ); + // not contracted and non-stored index imply loop, e.g. loop over "j" here + + + Tensor tmp( grid, type::FP16, axes( 0, 1 ) ); + set( tmp, m_block ); + + // row-wise max + // do this operation for all l indices + m_block( "i", "j" ) = max( m_block( "i", "j" ), Stmp( "i", "k", "l" ) , "l"); + + // row-wise Stmp -= m_block + // do this operation for all l indices + Stmp( "i", "k", "l" ) = minus( Stmp( "i", "k", "l" ), m_block( "i", "j" ), "l" ); + + // if no axes are specified then apply along all axes + // This is equivalent to reduction with scalar, just inplace + // Stmp = exp(Stmp) + Stmp = exp( Stmp ); + + // tmp=exp(tmp-m_block) + tmp = exp( tmp - m_block ); + + // l_block += rowsum(Stmp) + l_block += sum( Stmp( "i", "j", "k" ), "k" ); + + // 'row-wise' O_block *= tmp + O_block *= tmp; + + // tensor version of O_block = mxm( Stmp, V_block ), i.e., contraction + O_block( "i", "j", "k" ) += Stmp( "i", "l", "m" ) * V_block( "k", "r", "j" ); + + // 'row-wise' O_block *= 1/l_block + O_block /= l_block; + + // l_block = log(m_block) + m_block + l_block = log( m_block ) + m_block; + + // store output + alp::store( O_block ); + alp::store( l_block ); + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 5 >( ascend_code ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/ascend_flashattentionOp.cpp b/examples/ascend_flashattentionOp.cpp new file mode 100644 index 000000000..79cb0d84a --- /dev/null +++ b/examples/ascend_flashattentionOp.cpp @@ -0,0 +1,171 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +// alp::Grid< 1, 5 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +template < typename GridType > +void ascend_code( const GridType &grid, alp::RC &rc ) { + + // shape = ( Tr,Tc,Br,Bc,d ) + // Tr = number for row-blocks, Br = row-length of rowblocks; Tr*Tc = N + // Tc = number for column-blocks, Bc = column-length of rowblocks; Tr*Tc = M + // for softmax N == M, i.e. Sin and Sout are square matrices + rc = alp::RC::SUCCESS; + + // input + alp::Tensor Qtensorin( alp::Datatype::FP16, alp::make_axes( 0, 2, 4 ) ); // shape = ( Tr,Br,d ) + alp::Tensor Ktensorin( alp::Datatype::FP16, alp::make_axes( 1, 3, 4 ) ); // shape = ( Tc,Bc,d ) + alp::Tensor Vtensorin( alp::Datatype::FP16, alp::make_axes( 1, 3, 4 ) ); // shape = ( Tc,Bc,d ) + + // temp + alp::Tensor Otensorout( alp::Datatype::FP16, alp::make_axes( 0, 2, 4 ) ); // shape = ( Tr,Br,d ) + alp::Tensor mtensorout( alp::Datatype::FP16, alp::make_axes( 0, 2 ) ); // shape = ( Tr,Br ) + alp::Tensor ltensorout( alp::Datatype::FP16, alp::make_axes( 0, 2 ) ); // shape = ( Tr,Br ) + + rc = !rc ? rc : grid.forEach( alp::make_axes( 0 ), [ & ] () { + + auto Q_block_in = Qtensorin.getView(); // T(2,4) + + auto O_block_out = Otensorout.getView(); // T(2,4) + auto m_block_out = mtensorout.getView(); // T(2) + auto l_block_out = ltensorout.getView(); // T(2) + + //TODO: fix, i.e. double replace with half + alp::set( m_block_out, -alp::Infinity ); + alp::set( l_block_out, alp::Zero ); + + rc = !rc ? 
rc : grid.forEach( alp::make_axes( 1 ), [ & ] () { + + // these tensors will have original axes with axes 0 and 1 removed + // Sij=S[i0,i1,:,:] + + auto K_block_in = Ktensorin.getView(); // T(3,4) + auto V_block_in = Vtensorin.getView(); // T(3,4) + + alp::Tensor Sij( alp::Datatype::FP16, alp::make_axes( 2, 3 ) ); + alp::Tensor Temp( alp::Datatype::FP16, alp::make_axes( 2, 3 ) ); + alp::Tensor rowmaxS( alp::Datatype::FP16, alp::make_axes( 2 ) ); + alp::Tensor mi_old( alp::Datatype::FP16, alp::make_axes( 2 ) ); + alp::Tensor expmidiff( alp::Datatype::FP16, alp::make_axes( 2 ) ); + + // T(2,3) T(2,4) T(3,4) + alp::apply( Sij, Q_block_in, K_block_in, "mxm", alp::make_axes( 4 ) ); + + // mi_old=cp.copy(mtensor[i,:]) + // T(2) T(2) + alp::set( mi_old, m_block_out); + + // rowmaxS=np.max(Si,axis=-1) + // T(2) T(2,3) + alp::apply( rowmaxS, Sij, "max", alp::make_axes( 3 ) ); + + // mtensor[i,:]=np.maximum(mtensor[i,:],rowmaxS) + // T(2) T(2) + alp::foldl( m_block_out, rowmaxS, "max" ); + + // Si=Si-np.expand_dims(mtensor[i,:], axis=-1) + // T(2,3) T(2) + alp::foldl( Sij, m_block_out, "minus", alp::make_axes( 3 ) ); + + // Si=np.exp(Si) + alp::foldl( Sij, "exp" ); + + // expmidiff=np.exp(mi_old-mtensor[i,:]) + // T(2) T(2) T(2) + alp::apply( expmidiff, mi_old, m_block_out, "minus" ); + + alp::foldl( expmidiff, "exp" ); + + // ltensor[i,:]*=expmidiff + // T(2) T(2) + alp::foldl( l_block_out, expmidiff, "times" ); + + // ltensor[i,:]+= np.sum(Si,axis=-1) + // T(2) T(2,3) + alp::foldl( l_block_out, Sij, "add", alp::make_axes( 3 ) ); + + // Otensor[i,:,:]*=np.expand_dims(expmidiff, axis=(-2,-1)) + // T(2,4) T(2) + alp::foldl( O_block_out, expmidiff, "times", alp::make_axes( 4 ) ); + + // T(2,3) T(2,4) T(3,4) + alp::apply( Temp, Sij, V_block_in, "mxm", alp::make_axes( 4 ) ); + // T(2,3) T(2,3) + alp::foldl( O_block_out, Temp , "add" ); + + } ); + + // Otensor[i,:,:]/=np.expand_dims(ltensor[i,:], axis=(-2,-1)) + // T(2,3) T(2) + alp::foldl( O_block_out, l_block_out, "divide", alp::make_axes( 3 ) ); + + //ltensor[i,:] = mtensor[i,:] + log(ltensor[i,:]) + // skip for now + + alp::store( O_block_out ); + alp::store( l_block_out ); + alp::store( m_block_out ); + + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 5 >( ascend_code, "KernelFlashattention" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/compile_and_run_flashattention.sh b/examples/compile_and_run_flashattention.sh new file mode 100755 index 000000000..3581335e3 --- /dev/null +++ b/examples/compile_and_run_flashattention.sh @@ -0,0 +1,67 @@ +#set current directory +CWD=$(pwd) + +bashargn=$# +#echo "bashargn $bashargn" +if [[ "$bashargn" == 2 ]] +then + opfile=$1 + hostfile=$2 +else + opfile="op.cpp" + hostfile="$CWD/flashattention_custom_main.cpp" + + #cleanup any previous output + rm -f a_npu input/*.bin 
output/output_z.bin *.o op.cpp + rm -rf /tmp/build_alp/ + + #build ALP code gnerator, i.e. ascend_flashattentionOp_ascend executable + mkdir /tmp/build_alp/ && cd /tmp/build_alp/ && cmake $CWD/../ && make ascend_flashattentionOp_ascend && cd $CWD + + #run ALP code generator generate, store it into op.cpp + /tmp/build_alp/examples/./ascend_flashattentionOp_ascend > op.cpp + + cat op.cpp +fi + +echo "compile: $opfile and $hostfile" + +#compile ascend code +# set the compiler path and the ASCEND_TOOLKIT_INSTALL_PATH +ASCEND_TOOLKIT_INSTALL_PATH="/usr/local/Ascend/ascend-toolkit/latest" +ccec_compiler="/home/HwHiAiUser/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec" + +#compile generated kernel code, i.e. $opfile +$ccec_compiler -xcce -DTILING_KEY_VAR=0 -I"$ASCEND_TOOLKIT_INSTALL_PATH/acllib/include" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/impl" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/interface" -I"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o $opfile.o -c $opfile + +#compile template host code, i.e. $hostfile +$ccec_compiler -xcce -DTILING_KEY_VAR=0 -I"$ASCEND_TOOLKIT_INSTALL_PATH/acllib/include" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/impl" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/interface" -I"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o $hostfile.o -c $hostfile + +#link the executable, i.e. a_npu +$ccec_compiler --cce-fatobj-link --cce-aicore-arch=dav-c100 $opfile.o $hostfile.o -o a_npu -L"$ASCEND_TOOLKIT_INSTALL_PATH/runtime/lib64" -L"$ASCEND_TOOLKIT_INSTALL_PATH/tools/simulator/Ascend910A/lib" -L"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/Ascend910A" -lstdc++ -lruntime -lascendcl + + +#generate input data in "input" directory +# and the reference output data in "output" directory +rm -f runtime*.csv +for n in {0..0} +do + rm -rf input output + #vec_length=$(( 8 * 2048 * ( 2 ** $n) )) + echo "generate input" + echo "python3 flashattention_custom.py ${vec_length}" + mkdir -p input + mkdir -p output + python3 flashattention_custom.py #${vec_length} + + #run ascend example, run ./a_npu on 910 + echo "run ascend example" + echo "./a_npu ${vec_length}" + ./a_npu #${vec_length} + + python3 flashattention_check.py + +done + + + diff --git a/examples/compile_and_run_onlinesoftmax.sh b/examples/compile_and_run_onlinesoftmax.sh new file mode 100755 index 000000000..45e18fa34 --- /dev/null +++ b/examples/compile_and_run_onlinesoftmax.sh @@ -0,0 +1,61 @@ +#set current directory +CWD=$(pwd) + +bashargn=$# +#echo "bashargn $bashargn" +if [[ "$bashargn" == 2 ]] +then + opfile=$1 + hostfile=$2 +else + opfile="op.cpp" + hostfile="$CWD/onlinesoftmax_custom_main.cpp" + + #cleanup any previous output + rm -f a_npu input/*.bin output/output_z.bin *.o op.cpp + rm -rf /tmp/build_alp/ + + #build ALP code gnerator, i.e. 
ascend_onlinesoftmaxOp_ascend executable + mkdir /tmp/build_alp/ && cd /tmp/build_alp/ && cmake $CWD/../ && make ascend_onlinesoftmaxOp_ascend && cd $CWD + + #run ALP code generator generate, store it into op.cpp + /tmp/build_alp/examples/./ascend_onlinesoftmaxOp_ascend > op.cpp + + cat op.cpp +fi + +echo "compile: $opfile and $hostfile" + +#compile ascend code +# set the compiler path and the ASCEND_TOOLKIT_INSTALL_PATH +ASCEND_TOOLKIT_INSTALL_PATH="/usr/local/Ascend/ascend-toolkit/latest" +ccec_compiler="/home/HwHiAiUser/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec" + +#compile generated kernel code, i.e. $opfile +$ccec_compiler -xcce -DTILING_KEY_VAR=0 -I"$ASCEND_TOOLKIT_INSTALL_PATH/acllib/include" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/impl" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/interface" -I"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o $opfile.o -c $opfile + +#compile template host code, i.e. $hostfile +$ccec_compiler -xcce -DTILING_KEY_VAR=0 -I"$ASCEND_TOOLKIT_INSTALL_PATH/acllib/include" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/impl" -I"$ASCEND_TOOLKIT_INSTALL_PATH/compiler/tikcpp/tikcfw/interface" -I"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o $hostfile.o -c $hostfile + +#link the executable, i.e. a_npu +$ccec_compiler --cce-fatobj-link --cce-aicore-arch=dav-c100 $opfile.o $hostfile.o -o a_npu -L"$ASCEND_TOOLKIT_INSTALL_PATH/runtime/lib64" -L"$ASCEND_TOOLKIT_INSTALL_PATH/tools/simulator/Ascend910A/lib" -L"$ASCEND_TOOLKIT_INSTALL_PATH/tools/tikicpulib/lib/Ascend910A" -lstdc++ -lruntime -lascendcl + +rm -f runtime*.csv +rm -rf input output +echo "generate input" +echo "python3 onlinesoftmax_custom.py" +mkdir -p input +mkdir -p output +python3 onlinesoftmax_custom.py + +#run ascend example, run ./a_npu on 910 +echo "run ascend example" +echo "./a_npu ${vec_length}" +./a_npu #${vec_length} + +#python3 onlinesoftmax_print.py +python3 softmax_check-v5.py +#echo "NO onlinesoftmax_custom.py" + + + diff --git a/examples/online_softmax.py b/examples/online_softmax.py new file mode 100644 index 000000000..295783627 --- /dev/null +++ b/examples/online_softmax.py @@ -0,0 +1,96 @@ +import numpy as np +import copy as cp + + +def simplesoftmax(S_in): + S=cp.copy(S_in) + # S=softmax(S) + rowmaxS=np.max(S,axis=1) + S=S-np.tile(rowmaxS, (np.shape(S)[0],1)).T + S=np.exp(S) + rowsumS=np.sum(S,axis=1) + S=S/(np.tile(rowsumS, (np.shape(S)[0],1)).T) + + return(S) + + +def onlinesoftmax(S_in,Br=5,Bc=4): + save_shape=S_in.shape + + N,x=S_in.shape + + #check for too large block sizes + Br=min(Br,N) + Bc=min(Bc,N) + #get number of row/column blocks + Tr=N//Br + if(Tr*Br!=N): + Tr+=1 + Tc=N//Bc + if(Tc*Bc!=N): + Tc+=1 + + # outputs + # Initialize om HBM + O=np.zeros((N,N)) + l=np.zeros(N) + m=np.zeros(N)-np.Infinity + + + #switch to tensors + # dimensions (Tr, Br, Tc, Bc, d) + Otensor=np.reshape(O,(Tr,Br,Tc,Bc)) + mtensor=np.reshape(m,(Tr,Br)) + ltensor=np.reshape(l,(Tr,Br)) + del(m,l) + + QKtensor=np.reshape(S_in,(Tr,Br,Tc,Bc)) + + + for i in range(Tr): + + for j in range(Tc): + Si=QKtensor[i,:,j,:] + + mi_old=cp.copy(mtensor[i,:]) + + rowmaxS=np.max(Si,axis=-1) + + mtensor[i,:]=np.maximum(mtensor[i,:],rowmaxS) + + Si=Si-np.expand_dims(mtensor[i,:], axis=-1) 
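+            # Reading aid (added commentary): at this point Si holds Si - m_new, with
+            # m_new = max(m_old, rowmax(Si)). The steps below rescale the previously
+            # accumulated l and O by exp(m_old - m_new), the online-softmax correction
+            # that keeps all partial results normalised to the running row maximum.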
+ + Si=np.exp(Si) + + expmidiff=np.exp(mi_old-mtensor[i,:]) + ltensor[i,:]*=expmidiff + + ltensor[i,:]+= np.sum(Si,axis=-1) + + Otensor[i,:,:,:]*=np.expand_dims(expmidiff, axis=(-2,-1)) + + Otensor[i,:,j,:]=Si + + Otensor[i,:,:,:]/=np.expand_dims(ltensor[i,:], axis=(-2,-1)) + + O=np.reshape(Otensor,(N,N)) + + return(O,ltensor) + + +shape1=(128,16) +Q=np.random.random(shape1) +K=np.random.random(shape1) +V=np.identity(shape1[0]) + +Stmp=Q.dot(K.T) + +Osimple=simplesoftmax(Stmp) +Oflash,llash=onlinesoftmax(Stmp,Br=8,Bc=4) + +print("difference=",np.linalg.norm(Osimple-Oflash)) + + + + + diff --git a/examples/onlinesoftmax_custom.py b/examples/onlinesoftmax_custom.py new file mode 100644 index 000000000..9f3668e1c --- /dev/null +++ b/examples/onlinesoftmax_custom.py @@ -0,0 +1,26 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=16 + n1=32 + n2=16 + n3=16 + n4=16 + + N1 = n0*n1*n2*n3 + shape1 = (n0,n1,n2,n3) + + S0_gm = np.random.randint(1, 10, [N1]).astype(x1_gm_type) + infilename = "./input/s0_gm.bin" + S0_gm.tofile( infilename ) + + + +if __name__ == "__main__": + gen_golden_data() diff --git a/examples/onlinesoftmax_custom_main.cpp b/examples/onlinesoftmax_custom_main.cpp new file mode 100644 index 000000000..2a6a8958e --- /dev/null +++ b/examples/onlinesoftmax_custom_main.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * This file contains both CPU-debug and NPU code. We read data from a bin file + * and write the result to a file. + */ +#include +#include +#include +#include +#include + +#include "data_utils.h" + +#include "acl/acl.h" + +extern void custom_KernelOnlineSoftmax_do( + uint32_t coreDim, void* l2ctrl, void* stream, + uint8_t *param_Sin, uint8_t *param_Sout, uint8_t *param_m, uint8_t *param_l, + uint32_t _p, uint32_t n0, + uint32_t n1, uint32_t n2, uint32_t n3 ); + +#define DTYPE uint16_t + +constexpr uint32_t n0=16; +constexpr uint32_t n1=32; +constexpr uint32_t n2=16; +constexpr uint32_t n3=16; + +constexpr uint32_t N2 = n0*n2; +constexpr uint32_t N3 = n0*n1*n2*n3; + +#define REPS 20 + +int32_t main(int32_t argc, char* argv[]) +{ + size_t param_m_FileSize = N2 * sizeof( DTYPE ); + size_t param_l_FileSize = N2 * sizeof( DTYPE ); + size_t param_Sin_FileSize = N3 * sizeof( DTYPE ); + size_t param_Sout_FileSize = N3 * sizeof( DTYPE ); + uint32_t blockDim = 4; + + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + /////////////// allocate on host //////////////////////// + + uint8_t *param_m_Host; + CHECK_ACL(aclrtMallocHost((void**)(&param_m_Host), param_m_FileSize)); + + uint8_t *param_l_Host; + CHECK_ACL(aclrtMallocHost((void**)(&param_l_Host), param_l_FileSize)); + + uint8_t *param_Sin_Host; + CHECK_ACL(aclrtMallocHost((void**)(&param_Sin_Host), param_Sin_FileSize)); + ReadFile("./input/s0_gm.bin", param_Sin_FileSize, param_Sin_Host, param_Sin_FileSize); + + uint8_t *param_Sout_Host; + CHECK_ACL(aclrtMallocHost((void**)(&param_Sout_Host), param_Sout_FileSize)); + + /////////////// allocate on device //////////////////////// + + uint8_t *param_m_Device; + CHECK_ACL(aclrtMalloc((void**)&param_m_Device, param_m_FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *param_l_Device; +
CHECK_ACL(aclrtMalloc((void**)&param_l_Device, param_l_FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *param_Sin_Device; + CHECK_ACL(aclrtMalloc((void**)&param_Sin_Device, param_Sin_FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *param_Sout_Device; + CHECK_ACL(aclrtMalloc((void**)&param_Sout_Device, param_Sout_FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + std::vector< double > meas_vec( REPS ); + + for ( auto i = 0; i < REPS; ++i ) { + CHECK_ACL(aclrtMemcpy(param_Sin_Device, param_Sin_FileSize, param_Sin_Host, param_Sin_FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + std::cout << "Softmax rep " << i << std::endl; + auto begin = std::chrono::high_resolution_clock::now(); + + custom_KernelOnlineSoftmax_do( + blockDim, nullptr, stream, + param_Sin_Device, param_Sout_Device, + param_m_Device, param_l_Device, + blockDim, n0, n1, n2, n3 + ); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + auto end = std::chrono::high_resolution_clock::now(); + meas_vec[ i ] = static_cast< double >( std::chrono::duration_cast(end-begin).count() ); + } + + std::sort( meas_vec.begin(), meas_vec.end() ); + auto avg = std::accumulate( meas_vec.cbegin(), meas_vec.cend(), 0. ) / meas_vec.size(); + auto min = *( std::min_element( meas_vec.cbegin(), meas_vec.cend() ) ); + auto max = *( std::max_element( meas_vec.cbegin(), meas_vec.cend() ) ); + auto size = meas_vec.size(); + auto med = ( size % 2 == 0 ) ? ( meas_vec[ size / 2 - 1 ] + meas_vec[ size / 2 ] ) / 2 : meas_vec[ size / 2 ]; + std::cout << "Measured Time (avg, ms): " << avg * 1e-6 << std::endl; + std::cout << " (min, ms): " << min * 1e-6 << std::endl; + std::cout << " (max, ms): " << max * 1e-6 << std::endl; + std::cout << " (med, ms): " << med * 1e-6 << std::endl; + + + CHECK_ACL(aclrtMemcpy(param_m_Host, param_m_FileSize, param_m_Device, param_m_FileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(param_l_Host, param_l_FileSize, param_l_Device, param_l_FileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(param_Sout_Host, param_Sout_FileSize, param_Sout_Device, param_Sout_FileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("./output/output_s1.bin", param_Sout_Host, param_Sout_FileSize); + CHECK_ACL(aclrtFreeHost(param_Sin_Host)); + CHECK_ACL(aclrtFreeHost(param_Sout_Host)); + + CHECK_ACL(aclrtFree(param_Sin_Device)); + CHECK_ACL(aclrtFree(param_Sout_Device)); + + WriteFile("./output/output_m.bin", param_m_Host, param_m_FileSize); + WriteFile("./output/output_l.bin", param_l_Host, param_l_FileSize); + CHECK_ACL(aclrtFreeHost(param_l_Host)); + CHECK_ACL(aclrtFreeHost(param_m_Host)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); + + return 0; +} diff --git a/examples/softmax_custom-v1-1.cpp b/examples/softmax_custom-v1-1.cpp new file mode 100644 index 000000000..9eae29a99 --- /dev/null +++ b/examples/softmax_custom-v1-1.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
+ */ + +#include "kernel_operator.h" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: + __aicore__ inline KernelSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + + block_length0 = ( n0 * n1 * n2 ) / ( p0 * p1 * p2 ); + tile_length0 = ( n1 * n2 ) / BUFFER_NUM; + + } + + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n2 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n2 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, n2 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, n2 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + // loop count ( including effect of using BUFFER_NUM ) + const uint32_t loopCount0 = ( n0 * BUFFER_NUM ) / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + uint32_t i = i0; + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n1 ); + _tensor4_0Gm = localBuf_tensor4_0.Get< half >( ); // _tensor4_0Gm comes from API + + + // This loop comes from axis 1, does not need data movement + // For now process a tile row by row + const uint32_t loopCount1 = n1 / BUFFER_NUM; + for( uint32_t i1 = 0; i1 < n1 ; ++i1 ) { + CopyIn0(i0,i1); + Compute0( i1 ); + CopyOut0(i0,i1); + } + // free input tensors for reuse + // inQueue_tensor0_0.FreeTensor( _tensor0Local ); + } + } + + +private: + + __aicore__ inline void CopyIn0( + uint32_t _i0, uint32_t _i1 + ) { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + DataCopy( _tensor0Local, _tensor0_0Gm[ _i0 * n1 * n2 + _i1 * n2 ], n2 ); + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + } + __aicore__ inline void Compute0(uint32_t _i1) + { + // apply( _tensor4_0Gm, S_block_in, "max", make_axes(2) ) + ReduceMax( _tensor5_0temp, _tensor0Local, _tensor5_0temp[ ascend_el_per_blk ], n2, false ); + half max_ = _tensor5_0temp.GetValue( 0 ); + Duplicate( _tensor4_0Gm, max_, n2 ); // broadcast + + // apply( S_block_out, S_block_in, _tensor4_0Gm, "minus", 
make_axes(2) ); + Sub( _tensor1Local, _tensor0Local, _tensor4_0Gm, n2 ); + + // foldl( S_block_out, "exp" ); + Exp( _tensor1Local, _tensor1Local, n2 ); + + // apply( _tensor4_0Gm, S_block_out, "add", make_axes(2) ); + ReduceSum( _tensor6_0temp, _tensor1Local, _tensor6_0temp[ ascend_el_per_blk ], n2 ); + half rec_sum_ = _tensor6_0temp.GetValue( 0 ); + Duplicate( _tensor4_0Gm, rec_sum_, n2 ); // broadcast + + // foldl( S_block_out, _tensor4_0Gm, "divide", make_axes(2) ); + Div( _tensor1Local, _tensor1Local, _tensor4_0Gm, n2 ); + + } + __aicore__ inline void CopyOut0( + uint32_t _i0, uint32_t _i1 + ) { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = outQueue_tensor1_0.DeQue< half >(); + DataCopy( _tensor1_0Gm[ _i0 * n1 * n2 + _i1 * n2 ], _tensor1Local, n2 ); + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, n0, n1, n2; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + LocalTensor< half > _tensor5_0temp; + LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor4_0Gm; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) { + KernelSoftmax op(_p, _n0, _n1, _n2 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) +{ + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2 ); +} +#endif diff --git a/examples/softmax_custom-v1.cpp b/examples/softmax_custom-v1.cpp new file mode 100644 index 000000000..fc17296be --- /dev/null +++ b/examples/softmax_custom-v1.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ + +#include "kernel_operator.h" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: + __aicore__ inline KernelSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + + block_length0 = ( n0 * n1 * n2 ) / ( p0 * p1 * p2 ); + tile_length0 = ( n1 * n2 ) / BUFFER_NUM; + + } + + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. 
+ // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n2 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n2 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + // loop count ( including effect of using BUFFER_NUM ) + const uint32_t loopCount0 = ( n0 * BUFFER_NUM ) / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + uint32_t i = i0; + + CopyIn0(i); + + // This loop comes from axis 1, does not need data movement + // For now process a tile row by row + const uint32_t loopCount1 = n1 / BUFFER_NUM; + for( uint32_t i1 = 0; i1 < loopCount1 ; ++i1 ) { + Compute0( i1 ); + } + + // free input tensors for reuse + // inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + CopyOut0(i); + } + } + + +private: + + __aicore__ inline void CopyIn0(uint32_t progress) + { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + DataCopy( _tensor0Local, _tensor0_0Gm[ progress * tile_length0 ], tile_length0 ); + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n1 ); + _tensor4_0Gm = localBuf_tensor4_0.Get< half >( ); // _tensor4_0Gm comes from API + + } + __aicore__ inline void Compute0(uint32_t _i1) + { + // apply( _tensor4_0Gm, S_block_in, "max", make_axes(2) ) + ReduceMax( _tensor5_0temp[ 0 ], _tensor0Local[ _i1 * n2 ], _tensor5_0temp[ ascend_el_per_blk ], n2, false ); + half max_ = _tensor5_0temp[ 0 ].GetValue( 0 ); + Duplicate( _tensor4_0Gm, max_, n2 ); // broadcast + + // apply( S_block_out, S_block_in, _tensor4_0Gm, "minus", make_axes(2) ); + Sub( _tensor1Local[ _i1 * n2 ], _tensor0Local[ _i1 * n2 ], _tensor4_0Gm, n2 ); + + // foldl( S_block_out, "exp" ); + Exp( _tensor1Local[ _i1 * n2 ], _tensor1Local[ _i1 * n2 ], n2 ); + + // apply( _tensor4_0Gm, S_block_out, "add", make_axes(2) ); + ReduceSum( _tensor6_0temp[ 0 ], _tensor1Local[ _i1 * n2 ], _tensor6_0temp[ ascend_el_per_blk ], n2 ); + half rec_sum_ = _tensor6_0temp[ 0 ].GetValue( 0 ); + Duplicate( _tensor4_0Gm, rec_sum_, n2 ); // broadcast + + // foldl( S_block_out, _tensor4_0Gm, "divide", make_axes(2) ); + Div( _tensor1Local[ _i1 * n2 ], _tensor1Local[ _i1 * n2 ], _tensor4_0Gm, n2 ); + + } + __aicore__ inline void CopyOut0(uint32_t progress) + { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = 
outQueue_tensor1_0.DeQue< half >(); + DataCopy( _tensor1_0Gm[ progress * tile_length0 ], _tensor1Local, tile_length0 ); + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, n0, n1, n2; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + LocalTensor< half > _tensor5_0temp; + LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor4_0Gm; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) { + KernelSoftmax op(_p, _n0, _n1, _n2 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) +{ + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2 ); +} +#endif diff --git a/examples/softmax_custom-v3.cpp b/examples/softmax_custom-v3.cpp new file mode 100644 index 000000000..94e43f9c9 --- /dev/null +++ b/examples/softmax_custom-v3.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ + +#include "kernel_operator.h" +#include "ascendlib.hpp" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: + __aicore__ inline KernelSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2, const uint32_t _n3 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + p3 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + n3 = _n3; + + block_length0 = ( n0 * n1 * n2 * n3 ) / ( p0 * p1 * p2 * p3 ); + tile_length0 = ( n1 * n2 * n3 ) / BUFFER_NUM; + + } + + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 
8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n3 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n3 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + tempBuffInit(); + + const uint32_t loopCount0 = n0 / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + + const uint32_t loopCount1 = n1 ; + for (uint32_t i1 = 0; i1 < n1; i1++) { + + uint32_t gm_pointer = i0*n1*n2*n3 + i1*n2*n3; + //uint32_t blocklen=n3; + uint32_t stride=n3; + //uint32_t nblocks=n2; + + CopyIn0(gm_pointer,n3,stride,n2); + + // apply( _tensor2Local, S_block_in, "max", make_axes(2) ) + alp::BlockReduceMax( _tensor2Local, _tensor0Local, _tensor5_0temp[ ascend_el_per_blk ], n2, n3 ); + + + // apply( S_block_out, S_block_in, _tensor2Local, "minus", make_axes(2) ); + alp::BlockBcastMinus( _tensor1Local, _tensor0Local, _tensor2Local, _tensor5_0temp, n2, n3 ); + + alp::BlockExp( _tensor1Local, _tensor1Local, n2, n3 ); + + // apply( _tensor2Local, S_block_out, "add", make_axes(2) ); + alp::BlockReduceSum( _tensor2Local, _tensor1Local, _tensor6_0temp[ ascend_el_per_blk ], n2, n3 ); + + + // foldl( S_block_out, _tensor2Local, "divide", make_axes(2) ); + alp::BlockBcastDivide( _tensor1Local, _tensor1Local, _tensor2Local, _tensor5_0temp, n2, n3 ); + + CopyOut0(gm_pointer,n3,stride,n2); + + } + } + + } + + +private: + + + __aicore__ inline void tempBuffInit() { + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + + pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n2 ); + _tensor2Local = localBuf_tensor4_0.Get< half >( ); // _tensor2Local comes from API + } + + + __aicore__ inline void CopyIn0( + uint32_t gm_pointer, uint32_t blocklen, uint32_t stride, uint32_t nblocks + ) + { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + + // DataCopyParams dcpy_param; + // dcpy_param.blockCount=nblocks; + // dcpy_param.blockLen =blocklen; + // dcpy_param.srcStride =stride; + // dcpy_param.dstStride =0; + // DataCopy( _tensor0Local, _tensor0_0Gm[ gm_pointer ], dcpy_param ); + // DataCopy( _tensor0Local, _tensor0_0Gm[ gm_pointer ], blocklen ); + for( uint32_t k = 0; k < nblocks ; ++k ) { + DataCopy( _tensor0Local[ k*blocklen ], _tensor0_0Gm[ gm_pointer + k*stride ], blocklen ); + } + + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + } + + __aicore__ inline void CopyOut0( + uint32_t gm_pointer, uint32_t blocklen, uint32_t stride, uint32_t nblocks + ) { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = 
outQueue_tensor1_0.DeQue< half >(); + + // DataCopyParams dcpy_param; + // dcpy_param.blockCount=nblocks; + // dcpy_param.blockLen =blocklen; + // dcpy_param.srcStride =0; + // dcpy_param.dstStride =stride; + // DataCopy( _tensor1_0Gm[ gm_pointer ], _tensor1Local, dcpy_param ); + // DataCopy( _tensor1_0Gm[ gm_pointer ], _tensor1Local, blocklen ); + for( uint32_t k = 0; k < nblocks ; ++k ) { + DataCopy( _tensor1_0Gm[ gm_pointer + k*stride ], _tensor1Local[ k*blocklen ], blocklen ); + } + + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, p3, n0, n1, n2, n3; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + + LocalTensor< half > _tensor5_0temp; + LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor2Local; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3 ) { + KernelSoftmax op(_p, _n0, _n1, _n2, _n3 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3 ) +{ + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2, _n3 ); +} +#endif diff --git a/examples/softmax_custom-v4.cpp b/examples/softmax_custom-v4.cpp new file mode 100644 index 000000000..77b69ebf0 --- /dev/null +++ b/examples/softmax_custom-v4.cpp @@ -0,0 +1,228 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ + +#include "kernel_operator.h" +#include "ascendlib.hpp" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: +/* + __aicore__ inline KernelSoftmax( + const uint32_t _p0, + const uint32_t _n0, + const uint32_t _n1, + const uint32_t _n2, + const uint32_t _n3, + const uint32_t _n4, + const uint32_t _n5 + ) { + p0 = _p0; + p1 = 1; + p2 = 1; + p3 = 1; + p4 = 1; + p5 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + n3 = _n3; + n4 = _n4; + n5 = _n5; + + block_length0 = ( n0 * n1 * n2 * n3 * n4 * n5 ) / ( p0 * p1 * p2 * p3 * p4 * p5 ); + tile_length0 = ( n1 * n2 * n3 * n4 * n5 ) / BUFFER_NUM; + + } +*/ + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 
8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n3 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n3 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + tempBuffInit(); + + // loop count ( including effect of using BUFFER_NUM ) + + const uint32_t loopCount0 = n0 / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + + for (uint32_t i1 = 0; i1 < n1; i1++) { + // no loop i2 + for (uint32_t i3 = 0; i3 < n3; i3++) { + for (uint32_t i4 = 0; i4 < n4; i4++) { + // no loop i5 + + + uint32_t gm_pointer = i0*n1*n2*n3*n4*n5 + i1*n2*n3*n4*n5 + i3*n4*n5 + i4*n5; + uint32_t blocklen=n5; + uint32_t stride=n3*n4*n5; + uint32_t nblocks=n2; + + CopyIn0(gm_pointer,blocklen,stride,nblocks); + + alp::BlockReduceMax( _tensor4_0Gm, _tensor0Local, _tensor5_0temp[ ascend_el_per_blk ], nblocks, blocklen ); + + alp::BlockBcastMinus( _tensor1Local, _tensor0Local, _tensor4_0Gm, _tensor5_0temp, nblocks, blocklen ); + + alp::BlockExp( _tensor1Local, _tensor1Local, nblocks, blocklen ); + + alp::BlockReduceSum( _tensor4_0Gm, _tensor1Local, _tensor5_0temp[ ascend_el_per_blk ], nblocks, blocklen ); + + alp::BlockBcastDivide( _tensor1Local, _tensor1Local, _tensor4_0Gm, _tensor5_0temp, nblocks, blocklen ); + + CopyOut0(gm_pointer,blocklen,stride,nblocks); + + } + } + } + } + } + + +private: + + + __aicore__ inline void tempBuffInit() { + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + + // pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + // _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n2 ); + _tensor4_0Gm = localBuf_tensor4_0.Get< half >( ); // _tensor4_0Gm comes from API + } + + + __aicore__ inline void CopyIn0( + uint32_t gm_pointer, uint32_t blocklen, uint32_t stride, uint32_t nblocks + ) + { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + + // DataCopyParams dcpy_param; + // dcpy_param.blockCount=nblocks; + // dcpy_param.blockLen =blocklen; + // dcpy_param.srcStride =stride; + // dcpy_param.dstStride =0; + // DataCopy( _tensor0Local, _tensor0_0Gm[ gm_pointer ], dcpy_param ); + // DataCopy( _tensor0Local, _tensor0_0Gm[ gm_pointer ], blocklen ); + for( uint32_t k = 0; k < nblocks ; ++k ) { + DataCopy( _tensor0Local[ k*blocklen ], _tensor0_0Gm[ gm_pointer + k*stride ], blocklen ); + } + + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + + } + + __aicore__ inline void CopyOut0( + uint32_t gm_pointer, uint32_t blocklen, uint32_t stride, uint32_t nblocks + ) + { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = 
outQueue_tensor1_0.DeQue< half >(); + + // DataCopyParams dcpy_param; + // dcpy_param.blockCount=nblocks; + // dcpy_param.blockLen =blocklen; + // dcpy_param.srcStride =0; + // dcpy_param.dstStride =stride; + // DataCopy( _tensor1_0Gm[ gm_pointer ], _tensor1Local, dcpy_param ); + // DataCopy( _tensor1_0Gm[ gm_pointer ], _tensor1Local, blocklen ); + for( uint32_t k = 0; k < nblocks ; ++k ) { + DataCopy( _tensor1_0Gm[ gm_pointer + k*stride ], _tensor1Local[ k*blocklen ], blocklen ); + } + + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, p3, p4, p5; + uint32_t n0, n1, n2, n3, n4, n5; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + LocalTensor< half > _tensor5_0temp; + // LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor4_0Gm; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + // TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3, uint32_t _n4, uint32_t _n5 ) { + KernelSoftmax op(_p, _n0, _n1, _n2, _n3, _n4, _n5 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( + uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, + uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3, uint32_t _n4, uint32_t _n5 +) { + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2, _n3, _n4, _n5 ); +} +#endif diff --git a/examples/softmax_custom-v5.cpp b/examples/softmax_custom-v5.cpp new file mode 100644 index 000000000..4dd038f0a --- /dev/null +++ b/examples/softmax_custom-v5.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. 
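+ *
+ * Overview: this file implements an *online* (streaming) softmax over column
+ * blocks, keeping per row a running maximum m and a running normaliser l
+ * (m starts at -65504, the most negative fp16 value, and l at 0). For each
+ * column block S_j the update below amounts to, in pseudocode:
+ *
+ *   m_new = max( m_old, rowmax( S_j ) )            // BlockReduceMax + Max
+ *   P_j   = exp( S_j - m_new )                     // BlockBcastMinus + BlockExp
+ *   l_new = exp( m_old - m_new ) * l_old           // "expmidiff" rescaling
+ *           + rowsum( P_j )                        // BlockReduceSum + Add
+ *
+ * so that after the last block l holds the exact softmax denominator per row.
+ * The names S_j and P_j are illustrative only and do not appear in the code.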
+ */ + +#include "kernel_operator.h" +#include "ascendlib.hpp" + +#define TMP_MXM + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelOnlineSoftmax { +public: + __aicore__ inline KernelOnlineSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2, const uint32_t _n3 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + p3 = 1; + + n0 = _n0; // Tr + n1 = _n1; // Tc + n2 = _n2; // Br + n3 = _n3; // Bc // Sij(Br,Bc) + + block_length_out1 = ( n0 * n2 ) / ( p0 * p2 ); + tile_length_out1 = ( n2 ) / BUFFER_NUM; + + block_length_out2 = ( n0 * n2 ) / ( p0 * p2 ); + tile_length_out2 = ( n2 ) / BUFFER_NUM; + } + + __aicore__ inline void Init( + GM_ADDR tensorOut1, GM_ADDR tensorOut2, + GM_ADDR tensorS0, GM_ADDR tensorS1 + ) { + + // get start index for current core, core parallel + + _tensorOutm_Gm.SetGlobalBuffer( (__gm__ half *)tensorOut1 + block_length_out1 * GetBlockIdx(), block_length_out1 ); + _tensorOutl_Gm.SetGlobalBuffer( (__gm__ half *)tensorOut2 + block_length_out2 * GetBlockIdx(), block_length_out2 ); + pipe.InitBuffer( outQueue_tensor_l, BUFFER_NUM, tile_length_out1 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor_m, BUFFER_NUM, tile_length_out2 * sizeof( half ) ); + + uint32_t block_length_in_s = ( n0 * n1 * n2 * n3 ) / ( p0 * p1 * p2 * p3 ); + uint32_t tile_length_in_s = ( n1 * n2 * n3 ) / BUFFER_NUM; + + _tensorS0_Gm.SetGlobalBuffer( (__gm__ half *)tensorS0 + block_length_in_s * GetBlockIdx(), block_length_in_s ); + _tensorS1_Gm.SetGlobalBuffer( (__gm__ half *)tensorS1 + block_length_in_s * GetBlockIdx(), block_length_in_s ); + pipe.InitBuffer( inQueue_tensor_S0, BUFFER_NUM, n1*n2*n3 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor_S1, BUFFER_NUM, n1*n2*n3 * sizeof( half ) ); + + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 
8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n3 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n3 + ) * sizeof( half ); + + + pipe.InitBuffer( tempBuf_alltensors, totWorkSpaceSize + 3 * n2 ); + _tensor_Work4 = tempBuf_alltensors.Get< half >(); + + // 0: + // ascend_el_per_blk: TEMP / HIDDEN + // rowmaxS: totWorkSpaceSize + // mi_old: rowmaxS + n2; + // expmidiff: mi_old + n2; + // + // + expmidiff= totWorkSpaceSize; + mi_old = rowmaxS + n2; + rowmaxS = mi_old + n2; + + } + + __aicore__ inline void Process() + { + half Zero = 0; + + const uint32_t loopCount0 = n0 / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + + //*******************************// + // auto m_block_out = mtensorout.getView(); // T(2) + _tensor_m_i0 = outQueue_tensor_m.AllocTensor< half >(); + outQueue_tensor_m.EnQue( _tensor_m_i0 ); + _tensor_m_i0 = outQueue_tensor_m.DeQue< half >(); + + + // alp::set( m_block_out, -alp::Infinity ); + half mInf = -65504.0; //---- + Duplicate( _tensor_m_i0, mInf, n2 ); //---- //TODO SET scalar + + + // DataCopy here + //*******************************// + // auto l_block_out = ltensorout.getView(); // T(2) + _tensor_l_i0 = outQueue_tensor_l.AllocTensor< half >(); + outQueue_tensor_l.EnQue( _tensor_l_i0 ); + _tensor_l_i0 = outQueue_tensor_l.DeQue< half >(); + + + // alp::set( l_block_out, alp::Zero ); + Duplicate( _tensor_l_i0, Zero, n2 ); //---- //TODO SET scalar + + + // DataCopy here + //*******************************// + + const uint32_t loopCount1 = n1 ; + for( uint32_t i1 = 0; i1 < n1; i1++ ) { + + _tensorSijIn = inQueue_tensor_S0.AllocTensor< half >(); + _tensorSijOut = outQueue_tensor_S1.AllocTensor< half >(); + + // alp::Tensor Sij( alp::Datatype::FP16, alp::make_axes( 2, 3 ) ); + + // alp::Tensor Temp( alp::Datatype::FP16, alp::make_axes( 2, 3 ) ); + + // alp::Tensor rowmaxS( alp::Datatype::FP16, alp::make_axes( 2 ) ); + + // alp::Tensor mi_old( alp::Datatype::FP16, alp::make_axes( 2 ) ); + + // alp::Tensor expmidiff( alp::Datatype::FP16, alp::make_axes( 2 ) ); + + + DataCopy( _tensorSijIn, _tensorS0_Gm[ i0*n1*n2*n3 + i1*n2*n3 ], n2*n3 ); + inQueue_tensor_S0.EnQue( _tensorSijIn ); + _tensorSijIn = inQueue_tensor_S0.DeQue< half >(); + + // +++++++++++++++++++++++++++++ // + // Online softmax + + // set( mi_old, m_block_out); + DataCopy( _tensor_Work4[mi_old], _tensor_m_i0, n2 ); + + // apply( rowmaxS, S_block_in, "max", make_axes( 3 ) ); + alp::BlockReduceMax( _tensor_Work4[rowmaxS], _tensorSijIn, _tensor_Work4[ ascend_el_per_blk ], n2, n3 ); + + // foldl( m_block_out, rowmaxS, "max" ); + Max( _tensor_m_i0, _tensor_m_i0, _tensor_Work4[rowmaxS], n2 ); + + // // apply( S_block_out, S_block_in, m_block_out, "minus", make_axes( 3 ) ); + alp::BlockBcastMinus( _tensorSijOut, _tensorSijIn, _tensor_m_i0, _tensor_Work4, n2, n3 ); + + // Si=np.exp(Si) + alp::BlockExp( _tensorSijOut, _tensorSijOut, n2, n3 ); + + // expmidiff=np.exp(mi_old-mtensor[i,:]) + Duplicate( _tensor_Work4[expmidiff], Zero, n2 ); //---- + Sub( _tensor_Work4[expmidiff], _tensor_Work4[mi_old], _tensor_m_i0, n2 ); + Exp( _tensor_Work4[expmidiff], _tensor_Work4[expmidiff], n2 ); + + // foldl( l_block_out, expmidiff, "times" ); + Mul( _tensor_l_i0, _tensor_l_i0, 
_tensor_Work4[expmidiff], n2 ); + + // foldl( l_block_out, S_block_out, "add", make_axes( 3 ) ); + alp::BlockReduceSum( _tensor_Work4[rowmaxS], _tensorSijOut, _tensor_Work4[ ascend_el_per_blk ] , n2, n3 ); + Add( _tensor_l_i0, _tensor_l_i0, _tensor_Work4[rowmaxS], n2 ); + + // +++++++++++++++++++++++++++++ // + + outQueue_tensor_S1.EnQue( _tensorSijOut ); + _tensorSijOut = outQueue_tensor_S1.DeQue< half >(); + DataCopy( _tensorS1_Gm[ i0*n1*n2*n3 + i1*n2*n3 ], _tensorSijOut, n2*n3 ); + + inQueue_tensor_S0.FreeTensor( _tensorSijIn ); + outQueue_tensor_S1.FreeTensor( _tensorSijOut ); + } + + // // Uptade ltensor + // // CopyOUT ltensor & mtensor + + DataCopy( _tensorOutm_Gm[ i0 * n2 ], _tensor_m_i0, n2 ); + DataCopy( _tensorOutl_Gm[ i0 * n2 ], _tensor_l_i0, n2 ); + + outQueue_tensor_m.FreeTensor( _tensor_m_i0 ); + outQueue_tensor_l.FreeTensor( _tensor_l_i0 ); + + } + + } + + +private: + + + private: + + uint32_t p0, p1, p2, p3; + uint32_t n0, n1, n2, n3; + uint32_t block_length_out1, tile_length_out1; + uint32_t block_length_out2, tile_length_out2; + + int32_t ascend_el_per_blk, totWorkSpaceSize; + int32_t rowmaxS, mi_old, expmidiff; + + TPipe pipe; + + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor_S1; + TQue inQueue_tensor_S0; + + TQue outQueue_tensor_m; + TQue outQueue_tensor_l; + + + GlobalTensor< half > _tensorOutm_Gm; + GlobalTensor< half > _tensorOutl_Gm; + GlobalTensor< half > _tensorS0_Gm; + GlobalTensor< half > _tensorS1_Gm; + + LocalTensor< half > _tensorSijOut; + LocalTensor< half > _tensorSijIn; + + LocalTensor< half > _tensor_m_i0; + LocalTensor< half > _tensor_l_i0; + + LocalTensor< half > _tensor_Work4; + + + + TBuf< QuePosition::VECCALC > tempBuf_alltensors; +}; + +extern "C" __global__ __aicore__ void custom_KernelOnlineSoftmax( + GM_ADDR out1, GM_ADDR out2, + GM_ADDR S0, GM_ADDR S1, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3 +) { + KernelOnlineSoftmax op(_p, _n0, _n1, _n2, _n3 ); + op.Init( + out1, out2, + S0, S1 + ); // TODO fix Init + op.Process(); // TODO fix Process +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelOnlineSoftmax_do( + uint32_t blockDim, void* l2ctrl, void* stream, + uint8_t* out1, uint8_t* out2, + uint8_t* s0, uint8_t* s1, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2, uint32_t _n3 +) { + custom_KernelOnlineSoftmax<<< blockDim, l2ctrl, stream >>>( + out1, out2, + s0, s1, + _p, _n0, _n1, _n2, _n3 + ); +} +#endif diff --git a/examples/softmax_custom.cpp b/examples/softmax_custom.cpp new file mode 100644 index 000000000..fc17296be --- /dev/null +++ b/examples/softmax_custom.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. 
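+ *
+ * Worked example of the workspace sizing used in Init() below, assuming the
+ * usual AscendC values ONE_BLK_SIZE = 32 bytes and ONE_REPEAT_BYTE_SIZE = 256
+ * bytes (check the toolkit headers; the numbers here are only illustrative).
+ * For half precision and a row length n2 = 512:
+ *
+ *   ascend_el_per_blk = 32 / 2             = 16
+ *   elementsPerRepeat = 256 / 2            = 128
+ *   firstMaxRepeat    = 512 / 128          = 4
+ *   iter1OutputCount  = 4 * 2              = 8
+ *   tmpBufsColsReduce = RoundUp(8,16) * 16 = 16
+ *   totWorkSpaceSize  = (16 + 16 + 512) * 2 bytes = 1088 bytes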
+ */ + +#include "kernel_operator.h" + +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +__aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; +} + +//template < typename T > +class KernelSoftmax { +public: + __aicore__ inline KernelSoftmax( const uint32_t _p0, const uint32_t _n0, const uint32_t _n1, const uint32_t _n2 ) { + p0 = _p0; + p1 = 1; + p2 = 1; + + n0 = _n0; + n1 = _n1; + n2 = _n2; + + block_length0 = ( n0 * n1 * n2 ) / ( p0 * p1 * p2 ); + tile_length0 = ( n1 * n2 ) / BUFFER_NUM; + + } + + __aicore__ inline void Init( GM_ADDR tensor0, GM_ADDR tensor1 ) + { + + // get start index for current core, core parallel + _tensor0_0Gm.SetGlobalBuffer( (__gm__ half *)tensor0 + block_length0 * GetBlockIdx(), block_length0); + _tensor1_0Gm.SetGlobalBuffer((__gm__ half *)tensor1 + block_length0 * GetBlockIdx(), block_length0); + + // Min workspace for reduction ops. + // Taking the largest btw MaxReduce and SumReduce (ie, MaxReduce) as specified in the AscendC manual + // at Secs. 8.1.5.10.1 and 8.1.5.10.3 + ascend_el_per_blk = ONE_BLK_SIZE / sizeof( half ); + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / sizeof( half ); + int32_t firstMaxRepeat = n2 / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + + totWorkSpaceSize = ( + ascend_el_per_blk + tmpBufsColsReduce // Output + workspace for Max/SumReduce + + n2 + ) * sizeof( half ); + + pipe.InitBuffer( inQueue_tensor0_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + pipe.InitBuffer( outQueue_tensor1_0, BUFFER_NUM, tile_length0 * sizeof( half ) ); + + } + + __aicore__ inline void Process() + { + // loop count ( including effect of using BUFFER_NUM ) + const uint32_t loopCount0 = ( n0 * BUFFER_NUM ) / p0; + for (uint32_t i0 = 0; i0 < loopCount0; i0++) { + uint32_t i = i0; + + CopyIn0(i); + + // This loop comes from axis 1, does not need data movement + // For now process a tile row by row + const uint32_t loopCount1 = n1 / BUFFER_NUM; + for( uint32_t i1 = 0; i1 < loopCount1 ; ++i1 ) { + Compute0( i1 ); + } + + // free input tensors for reuse + // inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + CopyOut0(i); + } + } + + +private: + + __aicore__ inline void CopyIn0(uint32_t progress) + { + // alloc tensor from queue memory + _tensor0Local = inQueue_tensor0_0.AllocTensor< half >(); + // copy progress_th tile from global tensor to local tensor + DataCopy( _tensor0Local, _tensor0_0Gm[ progress * tile_length0 ], tile_length0 ); + // enque input tensors to VECIN queue + inQueue_tensor0_0.EnQue( _tensor0Local ); + + // deque input tensors from VECIN queue + _tensor0Local = inQueue_tensor0_0.DeQue< half >(); + _tensor1Local = outQueue_tensor1_0.AllocTensor< half >(); + + pipe.InitBuffer( tempBuf_tensor5_0, totWorkSpaceSize ); + _tensor5_0temp = tempBuf_tensor5_0.Get< half >( ); + pipe.InitBuffer( tempBuf_tensor6_0, totWorkSpaceSize ); + _tensor6_0temp = tempBuf_tensor6_0.Get< half >( ); + + pipe.InitBuffer( localBuf_tensor4_0, n1 ); + _tensor4_0Gm = localBuf_tensor4_0.Get< half >( ); // _tensor4_0Gm comes from API + + } + __aicore__ inline void Compute0(uint32_t _i1) + { + // apply( _tensor4_0Gm, S_block_in, "max", make_axes(2) ) + ReduceMax( _tensor5_0temp[ 0 ], _tensor0Local[ _i1 * n2 ], _tensor5_0temp[ ascend_el_per_blk ], n2, false ); + half max_ = _tensor5_0temp[ 0 ].GetValue( 0 ); + Duplicate( _tensor4_0Gm, max_, n2 ); // broadcast + + // apply( 
S_block_out, S_block_in, _tensor4_0Gm, "minus", make_axes(2) ); + Sub( _tensor1Local[ _i1 * n2 ], _tensor0Local[ _i1 * n2 ], _tensor4_0Gm, n2 ); + + // foldl( S_block_out, "exp" ); + Exp( _tensor1Local[ _i1 * n2 ], _tensor1Local[ _i1 * n2 ], n2 ); + + // apply( _tensor4_0Gm, S_block_out, "add", make_axes(2) ); + ReduceSum( _tensor6_0temp[ 0 ], _tensor1Local[ _i1 * n2 ], _tensor6_0temp[ ascend_el_per_blk ], n2 ); + half rec_sum_ = _tensor6_0temp[ 0 ].GetValue( 0 ); + Duplicate( _tensor4_0Gm, rec_sum_, n2 ); // broadcast + + // foldl( S_block_out, _tensor4_0Gm, "divide", make_axes(2) ); + Div( _tensor1Local[ _i1 * n2 ], _tensor1Local[ _i1 * n2 ], _tensor4_0Gm, n2 ); + + } + __aicore__ inline void CopyOut0(uint32_t progress) + { + outQueue_tensor1_0.EnQue< half >( _tensor1Local ); + // free input tensors for reuse + inQueue_tensor0_0.FreeTensor( _tensor0Local ); + + // deque output tensor from VECOUT queue + _tensor1Local = outQueue_tensor1_0.DeQue< half >(); + DataCopy( _tensor1_0Gm[ progress * tile_length0 ], _tensor1Local, tile_length0 ); + // free output tensor for reuse + outQueue_tensor1_0.FreeTensor( _tensor1Local ); + } + + private: + TPipe pipe; + // create queues for input, in this case depth is equal to buffer num + TQue inQueue_tensor0_0; + // create queue for output, in this case depth is equal to buffer num + TQue outQueue_tensor1_0; + + uint32_t p0, p1, p2, n0, n1, n2; + uint32_t block_length0, tile_length0; + int32_t ascend_el_per_blk, totWorkSpaceSize; + + GlobalTensor< half > _tensor0_0Gm, _tensor1_0Gm; + LocalTensor< half > _tensor0Local; + LocalTensor< half > _tensor1Local; + LocalTensor< half > _tensor5_0temp; + LocalTensor< half > _tensor6_0temp; + LocalTensor< half > _tensor4_0Gm; + + TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + TBuf< QuePosition::VECCALC > tempBuf_tensor6_0; + TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + +}; + +extern "C" __global__ __aicore__ void custom_KernelSoftmax( + GM_ADDR in, GM_ADDR out, + uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) { + KernelSoftmax op(_p, _n0, _n1, _n2 ); + op.Init( in, out ); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +// call of kernel function +void custom_KernelSoftmax_do( uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* in, uint8_t* out, uint32_t _p, uint32_t _n0, uint32_t _n1, uint32_t _n2 ) +{ + custom_KernelSoftmax<<< blockDim, l2ctrl, stream >>>( in, out, _p, _n0, _n1, _n2 ); +} +#endif diff --git a/examples/softmax_custom_main.cpp b/examples/softmax_custom_main.cpp new file mode 120000 index 000000000..61df9c48e --- /dev/null +++ b/examples/softmax_custom_main.cpp @@ -0,0 +1 @@ +unittests/host_ascend_softmaxOp.cpp \ No newline at end of file diff --git a/examples/unittests/HOST_TEST_TEMPLATE.cpp b/examples/unittests/HOST_TEST_TEMPLATE.cpp new file mode 100644 index 000000000..cd38835cc --- /dev/null +++ b/examples/unittests/HOST_TEST_TEMPLATE.cpp @@ -0,0 +1,138 @@ + +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * This file constains code of cpu debug and npu code.We read data from bin file + * and write result to file. 
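+ *
+ * Tokens of the form ##NAME## (e.g. ##KERNELNAME##, ##NTHREADS##, ##REPEATS##)
+ * are placeholders: the generate_host script driven by the Makefile in this
+ * directory substitutes them when producing the actual host source, so this
+ * template is not meant to compile as-is. The __CCE_KT_TEST__ branch runs the
+ * kernel through the CPU simulator (ICPU_RUN_KF); the other branch runs it via
+ * the ACL runtime on the NPU and reports min/max/avg/median times over REPS runs.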
+ */ +#include +#include +#include +#include +#include + +//#define _ANALYTIC_MODEL_ + +#ifdef _ANALYTIC_MODEL_ +#include "analytic_model.hpp" +#endif + +#include "data_utils.h" + +#ifdef __CCE_KT_TEST__ + +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void custom_##KERNELNAME##( +##CPUFRWDECTENSORALLLIST##, +##CPUFRWDECTHRDGRIDLIST##, +##CPUFRWDECTENSORSIZESLIST## +##ANALYTICMODELFORMALPARAMS## +); + +#else + +#include "acl/acl.h" +extern void custom_##KERNELNAME##_do( + uint32_t coreDim, void* l2ctrl, void* stream, +##FRWDECTENSORALLLIST##, +##FRWDECTHRDGRIDLIST##, +##FRWDECTENSORSIZESLIST## +##ANALYTICMODELFORMALPARAMS## +); + +#endif + + + +#define DTYPE uint16_t + +#define REPS ##REPEATS## + +int32_t main(int32_t argc, char* argv[]){ + int rc = 0; + uint32_t blockDim = ##NTHREADS##; + uint32_t _p0 = ##NTHREADS##; +##DECLARESIZES## + +##DECLARETENSORSIZES## + +##DECLAREANALYTICMODELPARAMS## + +#ifdef __CCE_KT_TEST__ +##CPUDECLARETENSOR## +##CPUREADFILES## + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF( + custom_##KERNELNAME##, + blockDim, +##CPUTENSORLIST##, + blockDim, +##ALLDIMENSIONSLIST##, ##ANALYTICMODELPARAMS## + ); // run the Kernel + +##CPUWRITETENSOR## + +##CPUFREETENSOR## +#else + + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = ##DEVICEID##; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + +##HOSTDECLARETENSOR## +##HOSTREADFILES## +##DEVICEDECLARETENSOR## + + std::vector< double > meas_vec( REPS ); + + for ( auto i = 0; i < REPS; ++i ) { +##HOST2DEVICEMOVE## + std::cout << "custom_##KERNELNAME## rep " << i << std::endl; + auto begin = std::chrono::high_resolution_clock::now(); + custom_##KERNELNAME##_do( + blockDim, nullptr, stream, +##DEVICETENSORLIST##, + blockDim, +##ALLDIMENSIONSLIST##, ##ANALYTICMODELPARAMS## + ); + rc = aclrtSynchronizeStream(stream); + CHECK_ACL(rc); + if( rc != 0 ) { + break; + } + auto end = std::chrono::high_resolution_clock::now(); + meas_vec[ i ] = static_cast< double >( std::chrono::duration_cast(end-begin).count() ); + } + + std::sort( meas_vec.begin(), meas_vec.end() ); + auto avg = std::accumulate( meas_vec.cbegin(), meas_vec.cend(), 0. ) / meas_vec.size(); + auto min = *( std::min_element( meas_vec.cbegin(), meas_vec.cend() ) ); + auto max = *( std::max_element( meas_vec.cbegin(), meas_vec.cend() ) ); + auto size = meas_vec.size(); + auto med = ( size % 2 == 0 ) ? 
( meas_vec[ size / 2 - 1 ] + meas_vec[ size / 2 ] ) / 2 : meas_vec[ size / 2 ]; + std::cout << "Measured Time (avg, ms): " << avg * 1e-6 << std::endl; + std::cout << " (min, ms): " << min * 1e-6 << std::endl; + std::cout << " (max, ms): " << max * 1e-6 << std::endl; + std::cout << " (med, ms): " << med * 1e-6 << std::endl; + +##DEVICE2HOSTMOVE## +##DEVICEFREETENSOR## +##WRITETENSOR## +##HOSTFREETENSOR## + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + if( rc != 0 ) { + return 1; + } else { + return 0; + } + +} diff --git a/examples/unittests/Makefile b/examples/unittests/Makefile new file mode 100644 index 000000000..294c7f2a6 --- /dev/null +++ b/examples/unittests/Makefile @@ -0,0 +1,56 @@ +ASCEND_TOOLKIT_INSTALL_PATH=$(ASCEND_HOME_PATH) + +CXX=/usr/bin/c++ +ccec_compiler=$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/ccec_compiler/bin/ccec + +ifeq ($(ASCEND_VERSION),910A) + ccec_falgs=-I$(ALP_ROOT)/include/graphblas/ascend/ -I$(ALP_ROOT)/include/asclib -xcce -DTILING_KEY_VAR=0 -I$(ASCEND_TOOLKIT_INSTALL_PATH)/acllib/include -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw/impl -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw/interface -I$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/tikicpulib/lib/include -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread + ccec_link_falgs=--cce-fatobj-link --cce-aicore-arch=dav-c100 -L$(ASCEND_TOOLKIT_INSTALL_PATH)/runtime/lib64 -L$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/simulator/Ascend910A/lib -L$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/tikicpulib/lib/Ascend910A -lstdc++ -lruntime -lascendcl -lm +endif + +ifeq ($(ASCEND_VERSION),910B) + ccec_falgs=-I$(ALP_ROOT)/include/graphblas/ascend/ -I$(ALP_ROOT)/include/asclib -xcce -DTILING_KEY_VAR=0 -I$(ASCEND_TOOLKIT_INSTALL_PATH)/acllib/include -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw/impl -I$(ASCEND_TOOLKIT_INSTALL_PATH)/compiler/tikcpp/tikcfw/interface -I$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/tikicpulib/lib/include -O2 -std=c++17 --cce-aicore-arch=dav-c220-vec -mllvm -cce-aicore-function-stack-size=16000 -mllvm -cce-aicore-record-overflow=false -mllvm -cce-aicore-addr-transform --cce-auto-sync -fPIC -pthread -DASCEND910B + ccec_link_falgs=--cce-fatobj-link --cce-aicore-arch=dav-c220-vec -L$(ASCEND_TOOLKIT_INSTALL_PATH)/runtime/lib64 -L$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/simulator/Ascend910B1/lib -L$(ASCEND_TOOLKIT_INSTALL_PATH)/tools/tikicpulib/lib/Ascend910B1 -lstdc++ -lruntime -lascendcl -lm -DASCEND910B +endif + +MODE=npu + +ifeq ($(ASCEND_CPU_MODE),ON) +ifeq ($(ASCEND_VERSION),910A) +MODE=cpu +ccec_compiler=$(CXX) + +ccec_falgs=-I$(ALP_ROOT)/include/graphblas/ascend/ -I$(ALP_ROOT)/include/asclib -D__CCE_AICORE__=100 -D__CCE_KT_TEST__=1 -D__DAV_C100__ -I${ASCEND_HOME_PATH}/acllib/include -isystem ${ASCEND_HOME_PATH}/tools/tikicpulib/lib/include -isystem ${ASCEND_HOME_PATH}/tools/tikicpulib/../../compiler/tikcpp/tikcfw -isystem ${ASCEND_HOME_PATH}/tools/tikicpulib/../../compiler/tikcpp/tikcfw/impl -isystem ${ASCEND_HOME_PATH}/tools/tikicpulib/../../compiler/tikcpp/tikcfw/interface -std=gnu++1z -g -std=c++17 + +ccec_link_falgs=-L${ASCEND_HOME_PATH}/tools/tikicpulib/lib -L${ASCEND_HOME_PATH}/tools/tikicpulib/lib/Ascend910A -L${ASCEND_HOME_PATH}/tools/tikicpulib/../simulator/Ascend910A/lib -L${ASCEND_HOME_PATH}/tools/tikicpulib/../../lib64 
-Wl,-rpath,${ASCEND_HOME_PATH}/tools/tikicpulib/lib:${ASCEND_HOME_PATH}/tools/tikicpulib/lib/Ascend910A:${ASCEND_HOME_PATH}/tools/tikicpulib/../simulator/Ascend910A/lib:${ASCEND_HOME_PATH}/tools/tikicpulib/../../lib64 -lascendcl -Wl,--no-as-needed -l_pvmodel -ltikcpp_debug ${ASCEND_HOME_PATH}/tools/tikicpulib/lib/libtikicpulib_cceprint.so ${ASCEND_HOME_PATH}/tools/tikicpulib/lib/libtikicpulib_npuchk.so ${ASCEND_HOME_PATH}/tools/tikicpulib/lib/libtikicpulib_stubreg.so -Wl,--as-needed + +endif +endif + +$(hostfile): $(target_cmake) $(host_template) $(host_code_inp) $(generate_host) + @echo python3 $(generate_host) $(host_template) $(hostfile) $(host_code_inp) 10 0 8 + @python3 $(generate_host) $(host_template) $(hostfile) $(host_code_inp) 10 0 8 + +$(host_code_inp): $(target_cmake) + @echo @$(target_cmake) + @$(target_cmake) + +$(devicefile): $(target_cmake) + @$(target_cmake) + +$(basename $(notdir $(hostfile))).$(MODE).o : $(hostfile) + @echo $(ccec_compiler) $(ccec_falgs) -c $(hostfile) -o $(basename $(notdir $(hostfile))).$(MODE).o + $(ccec_compiler) $(ccec_falgs) -c $(hostfile) -o $(basename $(notdir $(hostfile))).$(MODE).o + +$(basename $(notdir $(devicefile))).$(MODE).o : $(devicefile) + @echo $(ccec_compiler) $(ccec_falgs) -c $(devicefile) -o $(basename $(notdir $(devicefile))).$(MODE).o + $(ccec_compiler) $(ccec_falgs) -c $(devicefile) -o $(basename $(notdir $(devicefile))).$(MODE).o + +$(target): $(basename $(notdir $(devicefile))).$(MODE).o $(basename $(notdir $(hostfile))).$(MODE).o + $(ccec_compiler) $(basename $(notdir $(devicefile))).$(MODE).o $(basename $(notdir $(hostfile))).$(MODE).o -o $(target) $(ccec_link_falgs) + +clean: + @rm -f $(basename $(notdir $(devicefile))).$(MODE).o $(basename $(notdir $(hostfile))).$(MODE).o $(target) + +cleanall: + @rm -f .*o diff --git a/examples/unittests/alp_ascend_addOp.cpp b/examples/unittests/alp_ascend_addOp.cpp new file mode 100644 index 000000000..32d13a793 --- /dev/null +++ b/examples/unittests/alp_ascend_addOp.cpp @@ -0,0 +1,83 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +using namespace alp; + +template < typename GridType > +void ascend_code( const GridType &grid, RC&rc ) { + rc = RC::FAILED; + + Tensor x_global(Datatype::FP16, make_axes( "i", "j" ) ); // 0 is default + Tensor y_global(Datatype::FP16, make_axes( "i", "j" ) ); + Tensor z_global(Datatype::FP16, make_axes( "i", "j" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + auto x_block = getView( x_global ); + auto y_block = getView( y_global ); + auto z_block = getView( z_global ); + + // add( z_block, x_block, y_block, make_axes( 1 ) ); // z = x + y + z_block( "j" ) = add( x_block( "j" ), y_block( "j" ), "j" ); // z = x + y + + store( z_block ); + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 2 >( ascend_code, "addOp" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_addOpv1.cpp b/examples/unittests/alp_ascend_addOpv1.cpp new file mode 100644 index 000000000..3efbf2308 --- /dev/null +++ b/examples/unittests/alp_ascend_addOpv1.cpp @@ -0,0 +1,83 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +using namespace alp; + +template < typename GridType > +void ascend_code( const GridType &grid, RC&rc ) { + rc = RC::FAILED; + + Tensor x_global(Datatype::FP16, make_axes( "i" ) ); // 0 is default + Tensor y_global(Datatype::FP16, make_axes( "i" ) ); + Tensor z_global(Datatype::FP16, make_axes( "i" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + auto x_block = getView( x_global ); + auto y_block = getView( y_global ); + auto z_block = getView( z_global ); + + apply( z_block, x_block, y_block, "add" ); // z = x + y +// z_block( "j" ) = add( x_block( "j" ), y_block( "j" ), "j" ); // z = x + y + + store( z_block ); + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 1 >( ascend_code, "addOpv1" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_movedataOpv01.cpp b/examples/unittests/alp_ascend_movedataOpv01.cpp new file mode 100644 index 000000000..4bf21e900 --- /dev/null +++ b/examples/unittests/alp_ascend_movedataOpv01.cpp @@ -0,0 +1,90 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 3 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
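+//
+// What this example does: each forEach iteration stages a (j,k) block of Sin
+// into a local (UB) tensor and writes it back out unchanged, so the kernel it
+// generates is effectively a pure copy with no compute. That makes it a
+// convenient sanity check for the data-movement path alone (compare against
+// the golden data with check_data_movedataOp-v01.py).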
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { // max shape = ( Tr, Br, d ) + rc = RC::FAILED; + + Tensor Sin(Datatype::FP16, make_axes( "i", "j", "k" ) ); + Tensor Sout(Datatype::FP16, make_axes( "i", "j", "k" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + + auto S_block_in = getView( Sin ); + auto S_block_out = getView( Sout ); + + Tensor localTensor(Datatype::FP16, make_axes( "j", "k" ) ); + + set( localTensor, S_block_in); + set( S_block_out, localTensor); + + store( S_block_out ); + + } ); +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 3 >( ascend_code, "movedataOpv01" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_onlinesoftmaxOp.cpp b/examples/unittests/alp_ascend_onlinesoftmaxOp.cpp new file mode 100644 index 000000000..0f33d0fd3 --- /dev/null +++ b/examples/unittests/alp_ascend_onlinesoftmaxOp.cpp @@ -0,0 +1,173 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 4 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { + + // max shape = ( Tr,Tc,Br,Bc ) + // Tr = number for row-blocks, Br = row-length of rowblocks; Tr*Tc = N + // Tc = number for column-blocks, Bc = column-length of rowblocks; Tr*Tc = M + // for softmax N == M, i.e. 
Sin and Sout are square matrices + rc = alp::RC::FAILED; + + Tensor mtensorout( alp::Datatype::FP16, make_axes( 0, 2 ) ); // shape = ( Tr,Br ) + Tensor ltensorout( alp::Datatype::FP16, make_axes( 0, 2 ) ); // shape = ( Tr,Br )clear + Tensor Sin( alp::Datatype::FP16, make_axes( 0, 1, 2, 3 ) ); // shape = ( Tr,Tc,Br,Bc ) + Tensor Sout( alp::Datatype::FP16, make_axes( 0, 1, 2, 3 ) ); // shape = ( Tr,Tc,Br,Bc ) + + rc = grid.forEach( make_axes( 0 ), [ & ] () { + + auto m_block_out = getView( mtensorout ); + auto l_block_out = getView( ltensorout ); + +//--> + set( m_block_out, -alp::Infinity< double > ); // TODO the double should re replaced by alp::Datatype::FP16 +//--> + set( l_block_out, alp::Zero< double > ); //TODO + + grid.forEach( + make_axes( 1 ), // prallel loop- > for(i0=0; i0 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 4 >( ascend_code, "onlinesoftmaxOp" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_softmaxOp.cpp b/examples/unittests/alp_ascend_softmaxOp.cpp new file mode 100644 index 000000000..51d55b953 --- /dev/null +++ b/examples/unittests/alp_ascend_softmaxOp.cpp @@ -0,0 +1,115 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 3 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. +template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { // max shape = ( Tr, Br, d ) + rc = RC::FAILED; + + Tensor Sin(Datatype::FP16, make_axes( "i", "j", "k" ) ); // shape = (Tr, Br, d) + Tensor Sout(Datatype::FP16, make_axes( "i", "j", "k" ) ); // shape = (Tr, Br, d) + // Tensor of rank R, has R strides, defined in order to iterate + // the memory container. i.e. S with shape = (Tr, Br, d) + // element (i,j,k) is located in i*(Br*d) + j*(d) + k position + // (this is not the only mapping indices -> memory location) + // These basic (Tr, Br, d) stride has to be inherited by any view created from that + // container, in order to be able to properly iterate the memory container. 
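+ // Worked example (illustrative shape only): with (Tr, Br, d) = (4, 8, 16),
+ // element (i, j, k) = (1, 2, 3) sits at offset 1*(8*16) + 2*16 + 3 = 163
+ // in the flat buffer.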
+ + + // forEach cuts the grid into small pieces that are processed concurrently + rc = grid.forEach( make_axes( "i" ), [ &Sin, &Sout ] () { + + auto S_block_in = getView( Sin ); // S_block_in allocate in UB ts0 x n1 x n2 + auto S_block_out = getView( Sout ); // S_block_out allocate in UB ts0 x n1 x n2 + + Tensor localTensor(Datatype::FP16, make_axes( "j" ) ); // localTensor allocate in UB ts0 x n1 + + // T(1) T(1,2) + // apply( localTensor, S_block_in, "max", make_axes( "k" ) ); // asc::max( localTensor, S_block_in, "2" ) + localTensor( "j" ) = max( S_block_in("j", "k" ), "k" ); + + // T(1,2) T(1,2) + // apply( S_block_out, S_block_in, localTensor, "minus", make_axes( "k" ) ); + S_block_out( "j", "k" ) = minus( S_block_in("j", "k" ), localTensor( "j" ) , "k" ); + + // T(1,2) + // apply( S_block_out, S_block_out, "exp", make_axes( "k" ) ); + foldl( S_block_out, "exp" ); + + // T(1) T(1,2) + // apply( localTensor, S_block_out, "add", make_axes( "k" ) ); + localTensor( "j" ) = add( S_block_out("j", "k"), "k" ); + + // T(1,2) T(1) + // apply( S_block_out, S_block_out, localTensor, "divide", make_axes( "k" ) ); + foldl( S_block_out, localTensor, "divide", make_axes( "k" ) ); + + store( S_block_out ); + + } ); +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 3 >( ascend_code, "softmaxOp" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_softmaxOpv1.cpp b/examples/unittests/alp_ascend_softmaxOpv1.cpp new file mode 100644 index 000000000..f079ad986 --- /dev/null +++ b/examples/unittests/alp_ascend_softmaxOpv1.cpp @@ -0,0 +1,112 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 3 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
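+//
+// For reference, the axes-based calls below (max, minus, exp, add, divide over
+// axis "k") express the usual numerically stable row softmax. In plain C++ the
+// per-row computation is, as an illustrative sketch (`row` and `d` are
+// hypothetical names, not part of this API):
+//
+//   half m = row[ 0 ];
+//   for( size_t k = 1; k < d; ++k ) { if( row[ k ] > m ) { m = row[ k ]; } }
+//   half s = 0;
+//   for( size_t k = 0; k < d; ++k ) { row[ k ] = exp( row[ k ] - m ); s += row[ k ]; }
+//   for( size_t k = 0; k < d; ++k ) { row[ k ] = row[ k ] / s; }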
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { // max shape = ( Tr, Br, d ) + rc = alp::RC::FAILED; + + Tensor Sin( alp::Datatype::FP16, make_axes( "i", "j", "k" ) ); // shape = (Tr, Br, d) + Tensor Sout( alp::Datatype::FP16, make_axes( "i", "j", "k" ) ); // shape = (Tr, Br, d) + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + + rc = grid.forEach( make_axes( "j" ), [ & ] () { + + auto S_block_in = getView( Sin ); // T(2) + auto S_block_out = getView( Sout ); // T(2) + Tensor localTensor( alp::Datatype::FP16, make_axes( ) ); // T() + //Scalar localTensor( alp::Datatype::FP16 ); + + // T() T(2) -> ReduceMax( A, B, n2 ) + // apply( localTensor, S_block_in, "max", make_axes( "k" ) ); + localTensor( "j" ) = max( S_block_in("j", "k" ), "k" ); + + // T(2) T(2) T() -> BcastMinus( A, B, C, n2 ) + // apply( S_block_out, S_block_in, localTensor, "minus", make_axes( "k" ) ); + S_block_out( "j", "k" ) = minus( S_block_in("j", "k" ), localTensor( "j" ) , "k" ); + + // T(2) -> InplaceExp( A, n2 ) + foldl( S_block_out, "exp" ); + + // T() T(2) -> ReduceAdd( A, B, n2 ) + // apply( localTensor, S_block_out, "add", make_axes( "k" ) ); + localTensor( "j" ) = add( S_block_out("j", "k"), "k" ); + + // T(2) T() -> BcastDivide( A, B, n2 ) + foldl( S_block_out, localTensor, "divide", make_axes( "k" ) ); + + // T(2) + store( S_block_out ); + + } ); + + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 3 >( ascend_code, "softmaxOpv1" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_softmaxOpv3.cpp b/examples/unittests/alp_ascend_softmaxOpv3.cpp new file mode 100644 index 000000000..c417e5406 --- /dev/null +++ b/examples/unittests/alp_ascend_softmaxOpv3.cpp @@ -0,0 +1,111 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 4 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
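+//
+// softmaxOpv3 is the rank-4 variant: nested forEach over "i" and "j", views of
+// shape ("k","l") per iteration, reduction over "l". It presumably pairs with
+// the generated softmax_custom-v3.cpp earlier in this diff, which loops over
+// (i0, i1) and processes n2 x n3 blocks.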
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { + rc = alp::RC::FAILED; + + Tensor Sin( alp::Datatype::FP16, make_axes( "i", "j", "k", "l" ) ); + Tensor Sout( alp::Datatype::FP16, make_axes( "i", "j", "k", "l" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + + rc = grid.forEach( make_axes( "j" ), [ & ] () { + + auto S_block_in = getView( Sin ); // T(2,3) + auto S_block_out = getView( Sout ); // T(2,3) + Tensor localTensor( alp::Datatype::FP16, make_axes( "k" ) ); // T(2) + + // T(2) T(2,3) + // apply( localTensor, S_block_in, "max", make_axes( "l" ) ); + localTensor( "k" ) = max( S_block_in("k", "l" ), "l" ); + + // T(2,3) T(2,3) T(2) + // apply( S_block_out, S_block_in, localTensor, "minus", make_axes( "l" ) ); + S_block_out( "k", "l" ) = minus( S_block_in("k", "l" ), localTensor( "k" ) , "l" ); + + // T(2,3) + foldl( S_block_out, "exp" ); + + // T(2) T(2,3) + // apply( localTensor, S_block_out, "add", make_axes( "l" ) ); + localTensor( "k" ) = add( S_block_out("k", "l" ), "l" ); + + // T(2,3) T(2) + foldl( S_block_out, localTensor, "divide", make_axes( "k" ) ); + + // T(2,3) + store( S_block_out ); + + } ); + + } ); + + return; +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + alp::RC error_code = alp::RC::SUCCESS; + try { + error_code = alp::compile< 1, 4 >( ascend_code, "softmaxOpv3" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != alp::RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << alp::toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/alp_ascend_softmaxOpv4.cpp b/examples/unittests/alp_ascend_softmaxOpv4.cpp new file mode 100644 index 000000000..3b1283b87 --- /dev/null +++ b/examples/unittests/alp_ascend_softmaxOpv4.cpp @@ -0,0 +1,114 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#define DEBUG + +#include + +using namespace alp; + + +// alp::Grid< 1, 6 > note: +// - Thread dimensionality = 1, means that the 1D thread grid maps to first +// axis of the problem grid. A refinement of this API may make this +// configurable. 
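+//
+// softmaxOpv4 declares six axes and nests forEach over "i", "j", "l" and "m",
+// reducing over "n". This appears to correspond to the generated
+// softmax_custom-v4.cpp above, whose Process() loops over i0, i1, i3 and i4
+// and copies strided blocks of n2 rows by n5 columns (stride n3*n4*n5) before
+// applying the same max / minus / exp / add / divide sequence per row.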
+template < typename GridType > +void ascend_code( const GridType &grid, RC &rc ) { + rc = RC::FAILED; + + Tensor Sin(Datatype::FP16, make_axes( "i", "j", "k", "l", "m", "n" ) ); + Tensor Sout(Datatype::FP16, make_axes( "i", "j", "k", "l", "m", "n" ) ); + + rc = grid.forEach( make_axes( "i" ), [ & ] () { + + rc = grid.forEach( make_axes( "j" ), [ & ] () { + + rc = grid.forEach( make_axes( "l" ), [ & ] () { + + rc = grid.forEach( make_axes( "m" ), [ & ] () { + + auto S_block_in = getView( Sin ); // T(2,5) + auto S_block_out = getView( Sout ); // T(2,5) + Tensor localTensor(Datatype::FP16, make_axes( "k" ) ); // T(2) + + // T(2) T(2,5) + // apply( localTensor, S_block_in, "max", make_axes( "n" ) ); + localTensor( "k" ) = max( S_block_in("k", "n" ), "n" ); + + // T(2,5) T(2,5) T(2) + // apply( S_block_out, S_block_in, localTensor, "minus", make_axes( "n" ) ); + S_block_out( "k", "n" ) = minus( S_block_in("k", "n" ), localTensor( "k" ) , "n" ); + + // T(2,5) + foldl( S_block_out, "exp" ); + + // T(2) T(2,5) + // apply( localTensor, S_block_out, "add", make_axes( "n" ) ); + localTensor( "k" ) = add( S_block_out("k", "n" ), "n" ); + + // T(2,5) T(2) + foldl( S_block_out, localTensor, "divide", make_axes( "n" ) ); + + // T(2,5) + store( S_block_out ); + + } ); + } ); + } ); + } ); +} + +int main( int argc, char ** argv ) { + + // default options + bool printUsage = false; + + // input error checking + if( argc > 1 ) { + printUsage = true; + } + + // print help on error + if( printUsage ) { + std::cerr << "Usage: " << argv[ 0 ] << "\n"; + return 10; + } + + // start opgen + std::cout << "//This is AscendOpGen example " << argv[ 0 ] << "\n"; + RC error_code = RC::SUCCESS; + try { + error_code = alp::compile< 1, 6 >( ascend_code, "softmaxOpv4" ); + } catch( std::exception &e ) { + std::cerr << "alp::compile threw error: " << e.what() << "\n"; + return 20; + } + if( error_code != RC::SUCCESS ) { + std::cerr << std::flush; + std::cout << "Codegen FAILED (" << toString( error_code ) << ")" + << std::endl; + return 30; + } else { + std::cout << "//Codegen OK" << std::endl; + return 0; + } + +} + diff --git a/examples/unittests/check_data_movedataOp-v01.py b/examples/unittests/check_data_movedataOp-v01.py new file mode 100644 index 000000000..863e52d40 --- /dev/null +++ b/examples/unittests/check_data_movedataOp-v01.py @@ -0,0 +1,83 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import glob +import re +import sys + +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + +def check_golden_data(): + check = True + tol = 1.e-2 + + goldenfilename = "./output/golden.bin" + + outfiles=glob.glob("./output/param1.bin") + # sort outfiles + if(len(outfiles)>1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + dtype = np.float16 + print("(N,M)=",N,M) + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + print("Golden:") + reshaped_golden = np.reshape(golden, (M, N) ) + for pos, row in enumerate( reshaped_golden[[0,1,-2,-1]] ): + print(f"{pos}: {row}") + + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + + + 
print("Output:",outfilename) + reshaped_output = np.reshape(output, (M, N) ) + for pos, row in enumerate( reshaped_output[[0,1,-2,-1],:20] ): + print(f"{pos}: {row}") + + # diff = (golden.astype(float) - output.astype(float))**2 + # diff = np.cumsum((diff.flatten())) + # print("Diff**2:") + # reshaped_output = np.reshape(diff, (M, N) ) + # for pos, row in enumerate( reshaped_output[[0,1,-2,-1]] ): + # print(f"{pos}: {row}") + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + check = check and (norm_diff_relative1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + dtype = np.float16 + print("(N,M)=",N,M) + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + print("Golden:") + reshaped_golden = np.reshape(golden, (M, N) ) + for pos, row in enumerate( reshaped_golden[[0,1,-2,-1]] ): + print(f"{pos}: {row}") + + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + + + print("Output:",outfilename) + reshaped_output = np.reshape(output, (M, N) ) + for pos, row in enumerate( reshaped_output[[0,1,-2,-1],:20] ): + print(f"{pos}: {row}") + + # diff = (golden.astype(float) - output.astype(float))**2 + # diff = np.cumsum((diff.flatten())) + # print("Diff**2:") + # reshaped_output = np.reshape(diff, (M, N) ) + # for pos, row in enumerate( reshaped_output[[0,1,-2,-1]] ): + # print(f"{pos}: {row}") + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + check = check and (norm_diff_relative1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + + M = n0*n1 + N = n2*n3 + shape1=(n0,n1,n2,n3) + + dtype = np.float16 + + printblocks=[(0,0),(0,n1-1),(n0-1,n1-1),(n0-1,0)] + + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + + print("Golden:") + reshaped_golden = np.reshape(golden, shape1 ) + # for i,j in printblocks: + # print("i=",i," j=",j) + # for i2 in range(n2): + # print(reshaped_golden[i,j,i2,:]) + + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + print("Output:",outfilename) + reshaped_output = np.reshape(output, shape1 ) + # for i,j in printblocks: + # print("i=",i," j=",j) + # for i2 in range(n2): + # print(reshaped_output[i,j,i2,:]) + + # diff = (golden.astype(float) - output.astype(float))**2 + # diff = np.cumsum((diff.flatten())) + # print("Diff**2:") + # reshaped_output = np.reshape(diff, (M, N) ) + # for pos, row in enumerate( reshaped_output[[0,1,-2,-1]] ): + # print(f"{pos}: {row}") + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + 
print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + # for i0 in range(n0): + # for i1 in range(n1): + # for i2 in range(n2): + # rownorm=np.linalg.norm( reshaped_output[i0,i1,i2,:]-reshaped_golden[i0,i1,i2,:]) + # #print(i0,i1,i2,rownorm) + # if(rownorm>1.e-10): + # print(i0,i1,i2,rownorm) + # print(" output=",reshaped_output[i0,i1,i2,:]) + # print(" golden=",reshaped_golden[i0,i1,i2,:]) + check = check and (norm_diff_relative1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + n4=int(sys.argv[5]) + n5=int(sys.argv[6]) + + N = n0*n1*n2*n3*n4*n5 + shape1=(n0,n1,n2,n3,n4,n5) + + dtype = np.float16 + + # printblocks=[(0,0),(0,n1-1),(n0-1,n1-1),(n0-1,0)] + + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + + print("Golden:") + reshaped_golden = np.reshape(golden, shape1 ) + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + print("Output:",outfilename) + reshaped_output = np.reshape(output, shape1 ) + + + # for i0 in range(n0): + # for i1 in range(n1): + # for i2 in range(n2): + # for i3 in range(n3): + # for i4 in range(n4): + # tmp_diff=np.linalg.norm(reshaped_output[i0,i1,i2,i3,i4,:]-reshaped_golden[i0,i1,i2,i3,i4,:]) + # print(i0,i1,i2,i3,i4, + # " d=",tmp_diff, + # " o=",reshaped_output[i0,i1,i2,i3,i4,:2],".", + # " g=",reshaped_golden[i0,i1,i2,i3,i4,:2],".") + + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + check = check and (norm_diff_relative1): + ii=[ int(re.search(r'\d+', fname).group()) for fname in outfiles ] + outfiles=np.array(outfiles)[np.argsort(ii)] + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + dtype = np.float16 + print("(N,M)=",N,M) + golden = np.fromfile( goldenfilename, dtype=dtype ) + + # print(f"Golden: {golden[:10]}") + # print(f"Output: {output[:10]}") + + print("Golden:") + reshaped_golden = np.reshape(golden, (M, N) ) + for pos, row in enumerate( reshaped_golden[[0,1,-2,-1]] ): + print(f"{pos}: {row}") + + + for outfilename in outfiles: + + output = np.fromfile( outfilename, dtype=dtype ) + + + print("Output:",outfilename) + reshaped_output = np.reshape(output, (M, N) ) + for pos, row in enumerate( reshaped_output[[0,1,-2,-1],:20] ): + print(f"{pos}: {row}") + + # diff = (golden.astype(float) - output.astype(float))**2 + # diff = np.cumsum((diff.flatten())) + # print("Diff**2:") + # reshaped_output = np.reshape(diff, (M, N) ) + # for pos, row in enumerate( reshaped_output[[0,1,-2,-1]] ): + # print(f"{pos}: {row}") + + norm_diff = np.linalg.norm( (golden.astype(float) - output.astype(float)) ) + norm_diff_relative=norm_diff/np.linalg.norm( golden.astype(float) ) + print(f"Comparing in {goldenfilename} and {outfilename} (size {np.shape(golden)} and type {dtype}): absolute = {norm_diff} : relative = {norm_diff_relative} ") + check = check and (norm_diff_relative /dev/null && pwd ) +ALP_ROOT=$( realpath $SCRIPT_DIR/../../ ) +CWD=$(pwd) +KERNELNAME=$TNAME +if [[ "$ASCEND_CPU_MODE" == 
"ON" ]] ; then MODE="cpu"; else MODE="npu"; fi +TARGET=${TNAME}_${MODE} +TARGET_cmake=alp_ascend_${TNAME}_ascend +mkdir -p $CWD/src +hostfile_default="$CWD/src/host_ascend_${TNAME}.cpp" +[ -z ${HOST_TEST_TEMPLATE} ] && HOST_TEST_TEMPLATE=$(pwd)/HOST_TEST_TEMPLATE.cpp +HOST_CODE_INP=$CWD/src/generate_host_code_${TNAME}.inp + +bashargn=$# + +[ -z ${ASCEND_VERSION} ] && echo "ASCEND_VERSION not set" && exit 1 +! [[ "$ASCEND_VERSION" =~ (910A|910B) ]] && echo "ASCEND_VERSION possible values: 910A 910B" && exit 1 +[ -z ${ALP_ROOT} ] && echo "ALP_ROOT not set" && exit 1 +[ -z ${KERNELNAME} ] && echo "KERNELNAME not set" && exit 1 +[ -z ${TARGET} ] && echo "TARGET not set" && exit 1 +[ -z ${TARGET_cmake} ] && echo "TARGET_cmake not set" && exit 1 +[ -z ${CWD} ] && echo "CWD not set" && exit 1 +[ -z ${HOST_TEST_TEMPLATE} ] && echo "HOST_TEST_TEMPLATE not set" && exit 1 +[ -z ${HOST_CODE_INP} ] && echo "HOST_CODE_INP not set" && exit 1 +if [ -z ${ASCEND_HOME_PATH} ] +then + trydir01="/usr/local/Ascend/ascend-toolkit/latest" + if [ -d "$trydir01" ] + then + ASCEND_HOME_PATH="$trydir01" + fi +fi +[ -z ${ASCEND_HOME_PATH} ] && echo "ASCEND_HOME_PATH not set" && exit 1 + + +if [[ "$bashargn" == 2 ]] +then + #use provided code + opfile=$1 + hostfile=$2 + + opfile=$(realpath $opfile) + hostfile=$(realpath $hostfile) + echo "opfile=$opfile" + echo "hostfile=$hostfile" +else + #generate the code + mkdir -p src + opfile="src/${KERNELNAME}_npu_op.cpp" + hostfile="$hostfile_default" + + #cleanup any previous output + mkdir -p bin + #build ALP code gnerator, i.e. ascend_softmaxOp_ascend executable + if [ -z "$BUILD_DIR" ] + then + echo "BUILD_DIR is not set, create tmp BUILD_DIR in /tmp/build_alp/"; + rm -rf /tmp/build_alp/ + mkdir /tmp/build_alp/ && cd /tmp/build_alp/ && cmake $ALP_ROOT && make -j$(nproc) $TARGET_cmake && cd $CWD || { echo "codegen build failed" && exit 1; } + BUILD_DIR=/tmp/build_alp/ + else + echo "reuse BUILD_DIR"; + mkdir -p $BUILD_DIR + cd $BUILD_DIR && cmake $ALP_ROOT && make -j$(nproc) $TARGET_cmake && cd $CWD || { echo "codegen build failed" && exit 1; } + + fi + + # make devicecode + cd src + make $opfile devicefile="$opfile" target_cmake="$BUILD_DIR/examples/$TARGET_cmake" -f ${CWD}/Makefile || { echo "generate device code failed " && exit 1; } + cd .. + ls $opfile || { echo "$opfile not generated" && exit 1; } + + generate_host=$(pwd)/generate_host_code.py + + opfile=$(realpath $opfile) + hostfile=$(realpath $hostfile) + + # make hostcode + make $hostfile hostfile="$hostfile" generate_host="$generate_host" target_cmake="$BUILD_DIR/examples/$TARGET_cmake" host_template="$HOST_TEST_TEMPLATE" host_code_inp="$HOST_CODE_INP" ALP_ROOT="$ALP_ROOT" ASCEND_HOME_PATH="$ASCEND_HOME_PATH" ASCEND_VERSION="$ASCEND_VERSION" ASCEND_CPU_MODE="$ASCEND_CPU_MODE" -f ${CWD}/Makefile || { echo "generate host code failed " && exit 1; } + +fi + + + +mkdir -p bin +cd bin +make $TARGET target=$TARGET hostfile="$hostfile" devicefile="$opfile" ALP_ROOT="$ALP_ROOT" ASCEND_HOME_PATH="$ASCEND_HOME_PATH" ASCEND_VERSION="$ASCEND_VERSION" ASCEND_CPU_MODE="$ASCEND_CPU_MODE" -f ${CWD}/Makefile || { echo "ascend build failed" && exit 1; } +cd ../ + diff --git a/examples/unittests/compile_and_run_addOp.sh b/examples/unittests/compile_and_run_addOp.sh new file mode 100755 index 000000000..0d49b6f40 --- /dev/null +++ b/examples/unittests/compile_and_run_addOp.sh @@ -0,0 +1,42 @@ +TNAME="addOp" + +. 
compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin +rm -f runtime*.csv +for n in {0..0} +do + for axes in "256 2048" "512 2048" "1024 2048" "2048 2048" "4096 2048" "8192 2048" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_addOp.py ${axes} || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET" + ./$TARGET ${axes} || { echo "$TARGET returned error" && exit 1; } + + #check the result correctness + echo "compare md5sum : ";md5sum output/*.bin + md5_ref=($(md5sum output/golden.bin)) + md5_res=($(md5sum output/param2.bin)) + RED='\033[0;31m' + GREEN='\033[0;32m' + DEF='\033[0m' + if [ "$md5_ref" == "$md5_res" ] + then + printf "${GREEN}Test OK!${DEF}\n" + else + printf "${RED}Test FAILED!${DEF}\n" + exit 1 + fi + done +done + + + diff --git a/examples/unittests/compile_and_run_addOpv1.sh b/examples/unittests/compile_and_run_addOpv1.sh new file mode 100755 index 000000000..c485286f9 --- /dev/null +++ b/examples/unittests/compile_and_run_addOpv1.sh @@ -0,0 +1,42 @@ +TNAME="addOpv1" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin +rm -f runtime*.csv +for n in {0..0} +do + for axes in "256" "512" "1024" "2048" "4096" "8192" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_addOpv1.py ${axes} || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET" + ./$TARGET ${axes} || { echo "$TARGET returned error" && exit 1; } + + #check the result correctness + echo "compare md5sum : ";md5sum output/*.bin + md5_ref=($(md5sum output/golden.bin)) + md5_res=($(md5sum output/param2.bin)) + RED='\033[0;31m' + GREEN='\033[0;32m' + DEF='\033[0m' + if [ "$md5_ref" == "$md5_res" ] + then + printf "${GREEN}Test OK!${DEF}\n" + else + printf "${RED}Test FAILED!${DEF}\n" + exit 1 + fi + done +done + + + diff --git a/examples/unittests/compile_and_run_movedataOp-v01.sh b/examples/unittests/compile_and_run_movedataOp-v01.sh new file mode 100755 index 000000000..47acf98ea --- /dev/null +++ b/examples/unittests/compile_and_run_movedataOp-v01.sh @@ -0,0 +1,30 @@ +TNAME="movedataOpv01" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "32 16 16" "64 16 16" "128 16 16" "256 16 16" "512 16 16" + do + echo "axes=$axes" + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_movedataOp-v01.py $axes || { echo "$TARGET data generation failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_movedataOp-v01.py $axes || { echo "$TARGET check failed" && exit 1; } + done +done + + + diff --git a/examples/unittests/compile_and_run_onlinesoftmaxOp.sh b/examples/unittests/compile_and_run_onlinesoftmaxOp.sh new file mode 100755 index 000000000..82c78b301 --- /dev/null +++ b/examples/unittests/compile_and_run_onlinesoftmaxOp.sh @@ -0,0 +1,26 @@ +TNAME="onlinesoftmaxOp" + +. 
compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "16 32 16 16" "16 32 32 16" "16 32 32 32" "16 32 32 64" "32 16 16 64" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_onlinesoftmaxOp.py $axes || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_onlinesoftmaxOp.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done diff --git a/examples/unittests/compile_and_run_softmaxOp-v1.sh b/examples/unittests/compile_and_run_softmaxOp-v1.sh new file mode 100755 index 000000000..475e6cf59 --- /dev/null +++ b/examples/unittests/compile_and_run_softmaxOp-v1.sh @@ -0,0 +1,27 @@ +TNAME="softmaxOpv1" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "8 32 64" "8 32 128" "8 256 128" "32 128 128" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_softmaxOp-v1.py $axes || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_softmaxOp-v1.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done diff --git a/examples/unittests/compile_and_run_softmaxOp-v3.sh b/examples/unittests/compile_and_run_softmaxOp-v3.sh new file mode 100755 index 000000000..38bc4e025 --- /dev/null +++ b/examples/unittests/compile_and_run_softmaxOp-v3.sh @@ -0,0 +1,27 @@ +TNAME="softmaxOpv3" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "16 32 16 16" "16 32 32 16" "16 32 32 32" "16 32 32 64" "32 16 16 64" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_softmaxOp-v3.py $axes || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_softmaxOp-v3.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done diff --git a/examples/unittests/compile_and_run_softmaxOp-v4.sh b/examples/unittests/compile_and_run_softmaxOp-v4.sh new file mode 100755 index 000000000..84a3fdf89 --- /dev/null +++ b/examples/unittests/compile_and_run_softmaxOp-v4.sh @@ -0,0 +1,27 @@ +TNAME="softmaxOpv4" + +. 
compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "16 4 16 8 4 16" "16 4 16 8 4 128" "32 4 16 8 4 16" "16 4 32 8 4 16" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_softmaxOp-v4.py $axes || { echo "$TARGET make data failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_softmaxOp-v4.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done diff --git a/examples/unittests/compile_and_run_softmaxOp.sh b/examples/unittests/compile_and_run_softmaxOp.sh new file mode 100755 index 000000000..630226558 --- /dev/null +++ b/examples/unittests/compile_and_run_softmaxOp.sh @@ -0,0 +1,30 @@ +TNAME="softmaxOp" + +. compile.sh + +#generate input data in "input" directory +# and the reference output data in "output" directory +cd bin/ +rm -f runtime*.csv +for n in {0..0} +do + for axes in "1024 32 128" "1024 128 64" "1024 128 128" "1024 256 64" + do + rm -rf input output + + echo "generate input" + mkdir -p input + mkdir -p output + python3 ../make_data_softmaxOp.py $axes || { echo "$TARGET data generation failed" && exit 1; } + + echo "run ascend example" + echo "./$TARGET $axes" + ./$TARGET $axes || { echo "$TARGET failed" && exit 1; } + + python3 ../check_data_softmaxOp.py $axes || { echo "$TARGET check failed" && exit 1; } + done + +done + + + diff --git a/examples/unittests/generate_host_code.py b/examples/unittests/generate_host_code.py new file mode 100644 index 000000000..852ca23f3 --- /dev/null +++ b/examples/unittests/generate_host_code.py @@ -0,0 +1,287 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import re +import argparse + +host_code_template="" + +class Tensor: + """A simple tensor class""" + def __init__(self,grid,axes,inout,tid): + assert( inout=="in" or inout=="out" ) + self.grid=grid + self.axes=axes + self.inout=inout + self.tid=tid + + self.paramName="param"+str(tid)+inout + self.paramHostname=self.paramName+"Host" + self.paramDevicename=self.paramName+"Device" + self.paramFileSize=self.paramName+"FileSize" + + if( inout=="out" ): + self.paramFileNameOut='"./output/param'+str(tid)+'.bin"' + else: + self.paramFileNameIn='"./input/input'+str(tid)+'.bin"' + + self.paramFileSizeExpr=" * ".join(["_n"+str(k) for k in self.axes]) + + def print(self): + print(" Tensor[ grid = " ,self.grid, " axes= ", self.axes, " inout= ", self.inout, " tid = ", self.tid , " ]" ) + + +def parse_tensor_line(problem_grid_in,LineIn): + tensors_all_str=np.array((LineIn.split()[0]).split(",")) + i1=np.where(tensors_all_str=="in")[0] + i2=np.where(tensors_all_str=="out")[0] + ii=np.sort(np.concatenate((i1,i2))) + tensors_all=np.split(tensors_all_str,ii+1)[:-1] + tensors_axes=[ np.array(a[:-1]).astype(int) for a in tensors_all] + tensors_inout=[ a[-1] for a in tensors_all] + tensors_all_list=[ Tensor(problem_grid_in,a,io,tid) for tid,(a,io) in enumerate(zip(tensors_axes,tensors_inout)) ] + return(tensors_all_list) + +def get_grid_from_mcd(grid,tabs="\t"): + s="" + for i,n in enumerate(grid): + s=s+tabs+"uint32_t _n"+str(n)+" = atoi(argv["+str(i+1)+"]);\n" + return(s) + +def get_declaretensorsizes(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"size_t "+t.paramFileSize+" = "+t.paramFileSizeExpr+" * sizeof( DTYPE );\n" 
+ return(s) + +def get_host_alloc(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"uint8_t *"+t.paramHostname+";\n" + s=s+tabs+"CHECK_ACL(aclrtMallocHost((void**)(&"+t.paramHostname+"), "+t.paramFileSize+"));\n" + return(s) + +def get_host_readfiles(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "in" ): + s=s+tabs+'ReadFile('+t.paramFileNameIn+', '+t.paramFileSize+', '+t.paramHostname+', '+t.paramFileSize+');\n' + return(s) + +def get_device_alloc(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"uint8_t *"+t.paramDevicename+";\n" + s=s+tabs+"CHECK_ACL(aclrtMalloc((void**)(&"+t.paramDevicename+"), "+t.paramFileSize+", ACL_MEM_MALLOC_HUGE_FIRST));\n" + return(s) + +def get_host2device_move(tensors,tabs="\t\t"): + s="" + for t in tensors: + if( t.inout == "in" ): + s=s+tabs+"CHECK_ACL(aclrtMemcpy("+t.paramDevicename+", "+t.paramFileSize+", "+t.paramHostname+", "+t.paramFileSize+", ACL_MEMCPY_HOST_TO_DEVICE));\n" + return(s) + +def get_devicetensor_arglist(tensors,tabs="\t\t\t"): + s=tabs+",".join([t.paramDevicename for t in tensors]) + return(s) + +def get_alldim_list(grid,tabs="\t\t\t"): + s=tabs+", ".join(["_n"+str(k) for k in grid]) + return(s) + +def get_device2host_move(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "out" ): + s=s+tabs+"CHECK_ACL(aclrtMemcpy("+t.paramHostname+", "+t.paramFileSize+", "+t.paramDevicename+", "+t.paramFileSize+", ACL_MEMCPY_DEVICE_TO_HOST));\n" + return(s) + +def get_device_free(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"CHECK_ACL(aclrtFree("+t.paramDevicename+"));\n" + return(s) + +def get_host_free(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"CHECK_ACL(aclrtFreeHost("+t.paramHostname+"));\n" + return(s) + +def get_host_write(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "out" ): + s=s+tabs+'WriteFile('+t.paramFileNameOut+', '+t.paramHostname+', '+t.paramFileSize+');\n' + return(s) + +def get_frwdec_tensorlist(tensors,tabs="\t"): + s=tabs+", ".join([ "uint8_t *"+t.paramName for t in tensors]) + return(s) + +def get_frwdec_alldim_list(grid,tabs="\t"): + s=tabs+", ".join(["uint32_t n"+str(k) for k in grid]) + return(s) + +def get_frwdec_all_thrd_dim_list(grid,tabs="\t"): + s=tabs+", ".join(["uint32_t _p"+str(k) for k in grid]) + return(s) + +############## cpu code gen ################## + +def get_cpu_alloc(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"uint8_t* "+t.paramName+" = (uint8_t*)AscendC::GmAlloc("+t.paramFileSize+");\n" + return(s) + +def get_cpu_readfiles(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "in" ): + s=s+tabs+'ReadFile('+t.paramFileNameIn+', '+t.paramFileSize+', '+t.paramName+', '+t.paramFileSize+');\n' + return(s) + +def get_cputensor_arglist(tensors,tabs="\t\t"): + s=tabs+",".join([t.paramName for t in tensors]) + return(s) + +def get_cpu_write(tensors,tabs="\t"): + s="" + for t in tensors: + if( t.inout == "out" ): + s=s+tabs+'WriteFile('+t.paramFileNameOut+', '+t.paramName+', '+t.paramFileSize+');\n' + return(s) + +def get_cpu_free(tensors,tabs="\t"): + s="" + for t in tensors: + s=s+tabs+"AscendC::GmFree((void *)"+t.paramName+");\n" + return(s) + +def get_cpu_frwdec_tensorlist(tensors,tabs="\t"): + s=tabs+", ".join([ "GM_ADDR "+t.paramName for t in tensors]) + return(s) + +def get_cpu_frwdec_alldim_list(grid,tabs="\t"): + s=tabs+", ".join(["uint32_t n"+str(k) for k in grid]) + return(s) + +def get_cpu_frwdec_all_thrd_dim_list(grid,tabs="\t"): + s=tabs+", ".join(["uint32_t _p"+str(k) for k in grid]) + 
return(s) + +def get_analytic_mdel_arg_list(t,tabs=""): + s=tabs+", ".join([x.split()[-1] for x in t.strip().split(",") if x]) + return(s) + +def get_analytic_model_init(t,tcode,tabs="\t"): + s="" + s=s+"\n#ifdef _ANALYTIC_MODEL_\n" + s=s+"\n".join(tcode) + s=s+"\n#else\n" + s=s+";\n ".join([tabs+x.strip()+" = 1;" for x in t.strip().split(",") if x]) + s=s+"\n#endif\n" + return(s) + + + +parser = argparse.ArgumentParser(description='Generate host test code.') +parser.add_argument('template_file', type=str, nargs='+', + help='host code will be generated from this template') +parser.add_argument('out_file', type=str, nargs='+', + help='generated host code file name') +parser.add_argument('in_file', type=str, nargs='+', + help='input paramters') +parser.add_argument('repeats', type=str, nargs='+', default="10", + help='number or repeats in the unit tests') +parser.add_argument('device_id', type=str, nargs='+', default="0", + help='device id used in tests') +parser.add_argument('nthreads', type=str, nargs='+', default="8", + help='number of threads used in tests') +args = parser.parse_args() + +template_file=args.template_file[0] +out_file=args.out_file[0] +in_file=args.in_file[0] +repeats=args.repeats[0] +device_id=args.device_id[0] +nthreads=args.nthreads[0] + +print("args.template_file=",args.template_file) +print("args.out_file=",args.out_file) +print("args.in_file=",args.in_file) +print("args.repeats=",args.repeats) +print("args.device_id=",args.device_id) +print("args.nthreads=",args.nthreads) + +file1 = open(in_file, 'r') +Lines = file1.readlines() +file1.close() + +thread_grid=np.array((Lines[0].split()[0]).split(",")).astype(int) +problem_grid=np.array((Lines[1].split()[0]).split(",")).astype(int) +tensors_all=parse_tensor_line(problem_grid,Lines[2]) +kernel_name=Lines[3].split()[0] +analyticModelFormalParams=Lines[4] +i1=np.where( [ "BEGIN_ANALYTIC_MODEL" in l for l in Lines ] )[0][0] +i2=np.where( [ "END_ANALYTIC_MODEL" in l for l in Lines ] )[0][0] +analyticModelInitCode=Lines[i1+1:i2] +print("kernel_name =",kernel_name) +print("thread_grid =",thread_grid) +print("problem_grid =",problem_grid) +print("analyticModelFormalParams =",analyticModelFormalParams) +print("analyticModelInitCode =",analyticModelInitCode) +print("tensors =") +for t in tensors_all: + t.print() + +replace_rules=[ + ("##KERNELNAME##",kernel_name), + ("##REPEATS##",repeats), + ("##DEVICEID##",device_id), + ("##DECLARESIZES##",get_grid_from_mcd(problem_grid)), + ("##NTHREADS##",nthreads), + ("##DECLARETENSORSIZES##",get_declaretensorsizes(tensors_all)), + ("##HOSTDECLARETENSOR##",get_host_alloc(tensors_all)), + ("##HOSTREADFILES##",get_host_readfiles(tensors_all)), + ("##DEVICEDECLARETENSOR##",get_device_alloc(tensors_all)), + ("##HOST2DEVICEMOVE##",get_host2device_move(tensors_all)), + ("##DEVICETENSORLIST##",get_devicetensor_arglist(tensors_all)), + ("##ALLDIMENSIONSLIST##",get_alldim_list(problem_grid)), + ("##DEVICE2HOSTMOVE##",get_device2host_move(tensors_all)), + ("##DEVICEFREETENSOR##",get_device_free(tensors_all)), + ("##HOSTFREETENSOR##",get_host_free(tensors_all)), + ("##WRITETENSOR##",get_host_write(tensors_all)), + ("##FRWDECTENSORALLLIST##",get_frwdec_tensorlist(tensors_all)), + ("##FRWDECTENSORSIZESLIST##",get_frwdec_alldim_list(problem_grid)), + ("##FRWDECTHRDGRIDLIST##",get_frwdec_all_thrd_dim_list(thread_grid)), + ("##CPUDECLARETENSOR##",get_cpu_alloc(tensors_all)), + ("##CPUREADFILES##",get_cpu_readfiles(tensors_all)), + ("##CPUTENSORLIST##",get_cputensor_arglist(tensors_all)), + 
("##CPUWRITETENSOR##",get_cpu_write(tensors_all)), + ("##CPUFREETENSOR##",get_cpu_free(tensors_all)), + ("##CPUFRWDECTENSORALLLIST##",get_cpu_frwdec_tensorlist(tensors_all)), + ("##CPUFRWDECTENSORSIZESLIST##",get_cpu_frwdec_alldim_list(problem_grid)), + ("##CPUFRWDECTHRDGRIDLIST##",get_cpu_frwdec_all_thrd_dim_list(thread_grid)), + ("##ANALYTICMODELFORMALPARAMS##","\t"+analyticModelFormalParams), + ("##ANALYTICMODELPARAMS##",get_analytic_mdel_arg_list(analyticModelFormalParams)), + ("##DECLAREANALYTICMODELPARAMS##",get_analytic_model_init(analyticModelFormalParams,analyticModelInitCode)) + +] + +file1 = open(template_file, 'r') +Lines = file1.readlines() + +text=copy.copy(Lines) +for old,new in replace_rules: + for i in range(len(text)): + text[i] = text[i].replace(old,new) + +# writing to file +file1 = open(out_file, 'w') +file1.writelines(text) +file1.close() diff --git a/examples/unittests/make_data_addOp.py b/examples/unittests/make_data_addOp.py new file mode 100644 index 000000000..174a0581d --- /dev/null +++ b/examples/unittests/make_data_addOp.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import sys + +def gen_golden_data_simple(): + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + + N = n0*n1 + + input_x = np.random.uniform(-100, 100, N ).astype(np.float16) + input_y = np.random.uniform(-100, 100, N ).astype(np.float16) + golden = (input_x + input_y).astype(np.float16) + + input_x.tofile("./input/input0.bin") + input_y.tofile("./input/input1.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + assert(len(sys.argv)==3) + gen_golden_data_simple() diff --git a/examples/unittests/make_data_addOpv1.py b/examples/unittests/make_data_addOpv1.py new file mode 100644 index 000000000..0e6bb99cd --- /dev/null +++ b/examples/unittests/make_data_addOpv1.py @@ -0,0 +1,24 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import sys + +def gen_golden_data_simple(): + + n0=int(sys.argv[1]) + + N = n0 + + input_x = np.random.uniform(-100, 100, N ).astype(np.float16) + input_y = np.random.uniform(-100, 100, N ).astype(np.float16) + golden = (input_x + input_y).astype(np.float16) + + input_x.tofile("./input/input0.bin") + input_y.tofile("./input/input1.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + assert(len(sys.argv)==2) + gen_golden_data_simple() diff --git a/examples/unittests/make_data_movedataOp-v01.py b/examples/unittests/make_data_movedataOp-v01.py new file mode 100644 index 000000000..80831b372 --- /dev/null +++ b/examples/unittests/make_data_movedataOp-v01.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + x1_gm = np.random.randint(1, 10, [M, N]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + + # S=softmax(S) + # rowmaxS=np.max(S,axis=1) + # S=S-np.tile(rowmaxS, (np.shape(S)[1],1)).T + # S=np.exp(S) + # rowsumS=1/np.sum(S,axis=1) + # S=S*(np.tile(rowsumS, (np.shape(S)[1],1)).T) + + golden = S + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {M} x {N} and type {x1_gm_type} generated in {infilename} and {outfilename}") 
+ # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==4) + gen_golden_data() diff --git a/examples/unittests/make_data_onlinesoftmaxOp.py b/examples/unittests/make_data_onlinesoftmaxOp.py new file mode 100644 index 000000000..0dfd47e54 --- /dev/null +++ b/examples/unittests/make_data_onlinesoftmaxOp.py @@ -0,0 +1,124 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + n4=16 + + N0 = n0*n2*n4 + N1 = n1*n3*n4 + + shape0 = (n0,n2,n4) + shape1 = (n1,n3,n4) + shape2 = (n0,n1,n2,n3) + shape3 = (n0,n2) + + Q_gm = np.random.randint(1, 4, [N0]).astype(x1_gm_type) + V_gm = np.random.randint(1, 4, [N1]).astype(x1_gm_type) + K_gm = np.random.randint(1, 4, [N1]).astype(x1_gm_type) + + Q = copy.copy(Q_gm) + Q = np.reshape(Q,shape0) + #infilename = "./input/q_gm.bin" + #Q_gm.tofile( infilename ) + + K = copy.copy(K_gm) + K = np.reshape(K,shape1) + #infilename = "./input/k_gm.bin" + #K_gm.tofile( infilename ) + + V = copy.copy(V_gm) + V = np.reshape(V,shape1) + #infilename = "./input/v_gm.bin" + #V_gm.tofile( infilename ) + + + S0 = np.zeros(shape2).astype(x1_gm_type) + for i0 in range(n0): + for i1 in range(n1): + S0[i0,i1,:,:]=Q[i0,:,:].dot(V[i1,:,:].T) + S0_gm = np.reshape(S0,n0*n1*n2*n3) + #infilename = "./input/s0_gm.bin" + infilename = "./input/input2.bin" + S0_gm.tofile( infilename ) + + m0 = np.zeros(shape3).astype(x1_gm_type) + l0 = np.zeros(shape3).astype(x1_gm_type) + S1 = np.zeros(shape2).astype(x1_gm_type) + for i0 in range(n0): + l0[i0,:]=0 + m0[i0,:]=-65504.0 + for i1 in range(n1): + Sij=S0[i0,i1,:,:] + # Pi=Sij + mi_old=copy.copy(m0[i0,:]) + + rowmaxS=np.max(Sij,axis=1) + m0[i0,:]=np.maximum(m0[i0,:],rowmaxS) # m0[i0,:]=rowmaxS # TEMP + mi_bcast=np.tile(m0[i0,:], (np.shape(Sij)[1],1)) + Pi=Sij-mi_bcast.T + Pi=np.exp(Pi) + + expmidiff=np.exp(mi_old-m0[i0,:]) + l0[i0,:]*=expmidiff + l0[i0,:]+=np.sum(Pi,axis=1) + + S1[i0,i1,:,:]=Pi + + + + + # print("m0=",m0) + # print("l0=",l0) + m0_gm = np.reshape(m0,n0*n2) + l0_gm = np.reshape(l0,n0*n2) + s1_gm = np.reshape(S1,n0*n1*n2*n3) + goldenfilename = "./output/m0_golden.bin" + m0_gm.tofile( goldenfilename ) + goldenfilename = "./output/l0_golden.bin" + l0_gm.tofile( goldenfilename ) + goldenfilename = "./output/s1_golden.bin" + s1_gm.tofile( goldenfilename ) + + S2 = np.zeros(shape2).astype(x1_gm_type) + for i0 in range(n0): + for i1 in range(n1): + S2[i0,i1,:,:]=Q[i0,:,:].dot(V[i1,:,:].T) + S2_gm = np.reshape(S2,n0*n1*n2*n3) + infilename = "./input/s2_gm.bin" + S2_gm.tofile( infilename ) + + # S = copy.copy(x1_gm) + # S = np.reshape(S,(n0,n1,n2,n3)) + + # # S=block_softmax(S) + # for i0 in range(n0): + # for i1 in range(n1): + # Stmp=S[i0,i1,:,:] + # rowmaxS=np.max(Stmp,axis=1) + # Stmp=Stmp-np.tile(rowmaxS, (np.shape(Stmp)[1],1)).T + # Stmp=np.exp(Stmp) + # rowsumS=1/np.sum(Stmp,axis=1) + # Stmp=Stmp*(np.tile(rowsumS, (np.shape(Stmp)[1],1)).T) + # S[i0,i1,:,:]=Stmp + + # golden = S + + # outfilename = "./output/golden.bin" + # golden.tofile( outfilename ) + + # print(f"I/O of size {n0} x {n1} x {n2} x {n3} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==5) + gen_golden_data() diff --git a/examples/unittests/make_data_softmaxOp-v1.py b/examples/unittests/make_data_softmaxOp-v1.py new file mode 100644 
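make_data_onlinesoftmaxOp.py above accumulates the golden running statistics of an online (streaming) softmax: for each new score block, the row maximum m and the row sum of exponentials l are updated, and the previously accumulated sum is rescaled by exp(m_old - m_new). A compact sketch of that update rule, separated from the file I/O (the helper name and sizes are illustrative):

import numpy as np

def online_softmax_update(m, l, S_block):
    # One block update of the running statistics, as computed in
    # make_data_onlinesoftmaxOp.py: m is the running row maximum,
    # l the running row sum of exponentials.
    m_old = m.copy()
    m_new = np.maximum(m, np.max(S_block, axis=1))          # update the running maximum
    P = np.exp(S_block - m_new[:, None])                    # exponentials w.r.t. the new maximum
    l_new = l * np.exp(m_old - m_new) + np.sum(P, axis=1)   # rescale the old sum, add the new block
    return m_new, l_new, P

# illustrative sizes only
m = np.full(4, -65504.0)   # lowest FP16 value, the initial maximum used above
l = np.zeros(4)
for _ in range(3):
    m, l, P = online_softmax_update(m, l, np.random.rand(4, 8))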
index 000000000..e041b891a --- /dev/null +++ b/examples/unittests/make_data_softmaxOp-v1.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + x1_gm = np.random.randint(1, 10, [M, N]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + + # S=softmax(S) + rowmaxS=np.max(S,axis=1) + S=S-np.tile(rowmaxS, (np.shape(S)[1],1)).T + S=np.exp(S) + rowsumS=1/np.sum(S,axis=1) + S=S*(np.tile(rowsumS, (np.shape(S)[1],1)).T) + + golden = S + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {M} x {N} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==4) + gen_golden_data() diff --git a/examples/unittests/make_data_softmaxOp-v3.py b/examples/unittests/make_data_softmaxOp-v3.py new file mode 100644 index 000000000..772267d80 --- /dev/null +++ b/examples/unittests/make_data_softmaxOp-v3.py @@ -0,0 +1,48 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + + M = n0*n1 + N = n2*n3 + + x1_gm = np.random.randint(1, 10, [N, M]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + S = np.reshape(S,(n0,n1,n2,n3)) + + # S=block_softmax(S) + for i0 in range(n0): + for i1 in range(n1): + Stmp=S[i0,i1,:,:] + rowmaxS=np.max(Stmp,axis=1) + Stmp=Stmp-np.tile(rowmaxS, (np.shape(Stmp)[1],1)).T + Stmp=np.exp(Stmp) + rowsumS=1/np.sum(Stmp,axis=1) + Stmp=Stmp*(np.tile(rowsumS, (np.shape(Stmp)[1],1)).T) + S[i0,i1,:,:]=Stmp + + golden = S + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {n0} x {n1} x {n2} x {n3} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==5) + gen_golden_data() diff --git a/examples/unittests/make_data_softmaxOp-v4.py b/examples/unittests/make_data_softmaxOp-v4.py new file mode 100644 index 000000000..73fd7e416 --- /dev/null +++ b/examples/unittests/make_data_softmaxOp-v4.py @@ -0,0 +1,52 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + n3=int(sys.argv[4]) + n4=int(sys.argv[5]) + n5=int(sys.argv[6]) + + N = n0*n1*n2*n3*n4*n5 + + x1_gm = np.random.randint(1, 10, [N]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + + S=np.reshape(S,(n0,n1,n2,n3,n4,n5)) + for i0 in range(n0): + for i1 in range(n1): + for i3 in range(n3): + for i4 in range(n4): + Stmp=S[i0,i1,:,i3,i4,:] + rowmaxStmp=np.max(Stmp,axis=1) + Stmp=Stmp-np.tile(rowmaxStmp, (np.shape(Stmp)[1],1)).T + Stmp=np.exp(Stmp) + rowsumStmp=1/np.sum(Stmp,axis=1) + Stmp=Stmp*(np.tile(rowsumStmp, (np.shape(Stmp)[1],1)).T) + S[i0,i1,:,i3,i4,:]=Stmp + + + golden = np.reshape(S,(n0*n1*n2*n3*n4*n5)) + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + 
x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {n0} x {n1} x {n2} x {n3} x {n4} x {n5} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # print( golden ) + + +if __name__ == "__main__": + assert(len(sys.argv)==7) + gen_golden_data() diff --git a/examples/unittests/make_data_softmaxOp.py b/examples/unittests/make_data_softmaxOp.py new file mode 100644 index 000000000..28ef13518 --- /dev/null +++ b/examples/unittests/make_data_softmaxOp.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import copy +import sys + +def gen_golden_data(): + x1_gm_type = np.float16 + + n0=int(sys.argv[1]) + n1=int(sys.argv[2]) + n2=int(sys.argv[3]) + + M = n0*n1 + N = n2 + + x1_gm = np.random.randint(1, 10, [M, N]).astype(x1_gm_type) + + S = copy.copy(x1_gm) + + # S=softmax(S) + rowmaxS=np.max(S,axis=1) + S=S-np.tile(rowmaxS, (np.shape(S)[1],1)).T + S=np.exp(S) + rowsumS=1/np.sum(S,axis=1) + S=S*(np.tile(rowsumS, (np.shape(S)[1],1)).T) + + golden = S + + infilename = "./input/input0.bin" + outfilename = "./output/golden.bin" + + x1_gm.tofile( infilename ) + golden.tofile( outfilename ) + + print(f"I/O of size {M} x {N} and type {x1_gm_type} generated in {infilename} and {outfilename}") + # print( golden ) + +if __name__ == "__main__": + assert(len(sys.argv)==4) + gen_golden_data() diff --git a/examples/unittests/test_all.sh b/examples/unittests/test_all.sh new file mode 100755 index 000000000..162eced88 --- /dev/null +++ b/examples/unittests/test_all.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +SCRIPTS="compile_and_run_movedataOp-v01.sh compile_and_run_addOp.sh compile_and_run_addOpv1.sh compile_and_run_softmaxOp.sh compile_and_run_softmaxOp-v1.sh compile_and_run_softmaxOp-v3.sh compile_and_run_softmaxOp-v4.sh compile_and_run_onlinesoftmaxOp.sh" + +RED='\033[0;31m' +GREEN='\033[0;32m' +DEF='\033[0m' + +echo "" + +BUILD=$(pwd)/build_alp/ +rm -rf $BUILD +mkdir $BUILD + +PAD_LEN=$(for script in $SCRIPTS ; do echo $script ; done | wc --max-line-length) +PAD_LEN="$((PAD_LEN-16))" + +for script in $SCRIPTS +do + testname=$(echo -n ${script:16:-3}) + BUILD_DIR=$BUILD ./$script 2&>> /dev/null + if [ $? -ne 0 ] + then + printf "%-${PAD_LEN}s ${RED}FAILED${DEF} \n" $testname + exit 1 + else + printf "%-${PAD_LEN}s ${GREEN}PASSED${DEF} \n" $testname + fi +done +echo -e "\nAll tests OK!" 
+rm -rf $BUILD diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 2511528ee..5f5cc2a51 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -40,6 +40,9 @@ set( HEADERS_REGEX ".+\.(hpp|h|hxx|hh|h\\+\\+)$" ) # to avoid flaky acrobatics with regex or glob expressions, copy main files directly install( FILES "graphblas.hpp" DESTINATION "${INCLUDE_INSTALL_DIR}" ) +if( WITH_ASCEND_BACKEND ) + install( FILES "alpAscend.hpp" DESTINATION "${INCLUDE_INSTALL_DIR}" ) +endif() set( root_files "graphblas/backends.hpp" "graphblas/benchmark.hpp" "graphblas/blas0.hpp" "graphblas/blas1.hpp" "graphblas/blas2.hpp" @@ -169,6 +172,21 @@ if( WITH_NONBLOCKING_BACKEND ) install( TARGETS backend_nonblocking_headers EXPORT GraphBLASTargets ) endif() +if( WITH_ASCEND_BACKEND ) + add_library( backend_ascend_headers INTERFACE ) + target_link_libraries( backend_ascend_headers INTERFACE backend_reference_headers ) + target_compile_definitions( backend_ascend_headers INTERFACE "${ASCEND_INCLUDE_DEFS}" ) + target_include_directories( backend_ascend_headers INTERFACE + $ + $ + ) + install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/ascend/" + DESTINATION "${GRB_INCLUDE_INSTALL_DIR}/ascend" + FILES_MATCHING REGEX "${HEADERS_REGEX}" + ) + install( TARGETS backend_ascend_headers EXPORT GraphBLASTargets ) +endif() + if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND ) # copy headers, which are common to both distributed backends install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/graphblas/bsp/" diff --git a/include/alpAscend.hpp b/include/alpAscend.hpp new file mode 100644 index 000000000..f20da3b2c --- /dev/null +++ b/include/alpAscend.hpp @@ -0,0 +1,344 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * The main header to include in order to use ALP/Ascend codegen. + * + * @author A. N. Yzelman. + * @date 12th of September, 2023. + */ + +#ifndef _H_ALPASCEND +#define _H_ALPASCEND + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/** + * \defgroup ALPAscend ALP/Ascend + * + * This the ALP/Ascend module. 
+ * + * @{ + */ + +namespace alp +{ + namespace internal + { + extern iGrid *igrid; + extern AscendLazyEvaluation ale; + extern SymbolTable symbols; + } +} + +/** The ALP/Ascend namespace */ +namespace alp { + + using grb::RC; + + using grb::toString; + + namespace internal { + + template< size_t process_mesh_order, size_t problem_mesh_order > + using AscendCodeFunction = void (*) ( + const alp::Grid< process_mesh_order, problem_mesh_order > &, + alp::RC & + ); + + } + + template< size_t process_mesh_order, size_t problem_mesh_order > + static grb::RC compile( + const internal::AscendCodeFunction< + process_mesh_order, + problem_mesh_order + > ascend_code, + const std::string &kernel_name + ) { + grb::RC ret = grb::PANIC; + grb::Launcher< grb::EXEC_MODE::AUTOMATIC > launcher; + alp::Grid< process_mesh_order, problem_mesh_order > grid; + + alp::internal::igrid = + new alp::internal::iGrid( process_mesh_order, problem_mesh_order ); + + internal::OpGen::kernel_id = kernel_name; + + std::ofstream output_device_code; + output_device_code.open ( internal::OpGen::kernel_id + "_npu_op.cpp", std::ofstream::out | std::ofstream::trunc); + + std::ofstream output_host_log; + output_host_log.open ( "generate_host_code_" + internal::OpGen::kernel_id + ".inp", std::ofstream::out | std::ofstream::trunc); + + output_host_log << "0"; + for( size_t i = 1; i < process_mesh_order; ++i ) { + output_host_log << "," << i; + } + output_host_log << std::endl; + + output_host_log << "0"; + for( size_t i = 1; i < problem_mesh_order; ++i ) { + output_host_log << "," << i; + } + output_host_log << std::endl; + + // TODO perhaps the processSize and problemSize members should be generated + // more than once, for every forEach + // only the tile_num is the same? + + // const uint32_t _p0 + internal::OpGen::hostFormalParam << "const uint32_t _" << alp::internal::igrid->processSize( 0 ); + + // , const uint32_t _p1, const uint32_t _p2, const uint32_t _p3 ... + for( size_t i = 1; i < process_mesh_order; ++i ) { + internal::OpGen::hostFormalParam << ", const uint32_t _" << alp::internal::igrid->processSize( i ); + } + + // , const uint32_t _n0, const uint32_t _n1, const uint32_t _n2 + for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::hostFormalParam << ", const uint32_t _" << alp::internal::igrid->problemSize( i ); + } + + // _p0 + internal::OpGen::hostArg << "_" << alp::internal::igrid->processSize( 0 ); + + // , _p1, _p2, _p3 ... + for( size_t i = 1; i < process_mesh_order; ++i ) { + internal::OpGen::hostArg << ", _" << alp::internal::igrid->processSize( i ); + } + + // , _n0, _n1, _n2 ... + for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::hostArg << ", _" << alp::internal::igrid->problemSize( i ); + } + + // p0 = _p0; + // p1 = _p1; + // p2 = _p2; + // ... + // when i < process_mesh_order + for( size_t i = 0; i < process_mesh_order; ++i ) { + internal::OpGen::constrBody << "\n"; + internal::OpGen::constrBody << "\t\t\t" + << alp::internal::igrid->processSize( i ) + << " = _" << alp::internal::igrid->processSize( i ) + << ";"; + } + + // p1 = 1; + // p2 = 1; + // ... + // when process_mesh_order <= i < problem_mesh_order + for( size_t i = process_mesh_order; i < problem_mesh_order; ++i ) { + internal::OpGen::constrBody << "\n"; + internal::OpGen::constrBody << "\t\t\t" + << alp::internal::igrid->processSize( i ) + << " = 1;"; + } + + internal::OpGen::constrBody << "\n"; + + // n0 = _n0; + // n1 = _n1; + // n2 = _n2; + // ... 
+ for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::constrBody << "\n"; + internal::OpGen::constrBody << "\t\t\t" + << alp::internal::igrid->problemSize( i ) << " = _" + << alp::internal::igrid->problemSize( i ) << ";"; + } + + internal::OpGen::constrBody << "\n"; + + // uint32_t p0; + // uint32_t p1; + // uint32_t p2; + for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::classMembers << "\t\tuint32_t " + << alp::internal::igrid->processSize( i ) << ";\n"; + } + + internal::OpGen::classMembers << "\n"; + + // uint32_t n0; + // uint32_t n1; + // uint32_t n2; + for( size_t i = 0; i < problem_mesh_order; ++i ) { + internal::OpGen::classMembers << "\t\tuint32_t " + << alp::internal::igrid->problemSize( i ) << ";\n"; + } + + internal::OpGen::classMembers << "\n"; + + const RC launch_rc = launcher.exec< + alp::Grid< process_mesh_order, problem_mesh_order >, + alp::RC + > ( + ascend_code, grid, ret, true + ); + if( launch_rc != grb::SUCCESS ) { + throw std::runtime_error( "Launching codegen FAILED" ); + } + + // ANALYTIC MODEL + { + std::stringstream analyticModelArgs; + std::stringstream analyticModelFormalParams; + std::stringstream analyticModelDecls; + std::stringstream analyticModelConstrBody; + + // host body generation appends to hostArgs, so the below line must follow the previous one(!) + alp::internal::ale.generateHostBody( internal::OpGen::hostBody, + analyticModelArgs, analyticModelFormalParams, + analyticModelDecls, analyticModelConstrBody ); + + internal::OpGen::hostArg << analyticModelArgs.str(); + internal::OpGen::analyticModelFormalParams << analyticModelFormalParams.str(); + internal::OpGen::classMembers << analyticModelDecls.str(); + internal::OpGen::constrBody << analyticModelConstrBody.str(); + } + + /* + * Only once we are here we have execute all the forEach, + * and thus we have all the information we need to generate + * code and performs optimizations, especially across + * different forEach, and including handling multiple + * pipelines that may be built by the same forEach + * + */ + +// alp::internal::symbols.debug_print(); +// alp::internal::ale.debug_print(); + + // CLASS MEMBER DECLARATIONS + { + std::stringstream decl; + alp::internal::ale.generateDeclarations( decl ); + internal::OpGen::declarations << decl.str(); + } + + // CONSTRUCTOR BODY +// { +// std::stringstream constructor; +// alp::internal::ale.generateConstructor( constructor ); +// internal::OpGen::constrBody << constructor.str(); +// } + + // INIT BODY + { + if( alp::internal::symbols.existsTBufTensorDecl() == true ) { + + //TODO I should make the datatype a parameter + std::string temp_data_type = "half"; + std::stringstream max_n; +/* + max_n << "std::max( { " << alp::internal::igrid->problemSize( 0 ); + + for( size_t i = 1; i < problem_mesh_order; ++i ) { + max_n << ", " << alp::internal::igrid->problemSize( i ); + } + + // close all open parentheses + max_n << " } )"; +*/ + if( problem_mesh_order == 1 ) { + max_n << "" << alp::internal::igrid->problemSize( 0 ) << ""; + } else { + max_n << "alp::max( " << alp::internal::igrid->problemSize( 0 ) << ", "; + + for( size_t i = 1; i < problem_mesh_order - 1; ++i ) { + max_n << "alp::max( " << alp::internal::igrid->problemSize( i ) << ", "; + } + + // this corresponds to the last one, which is a special case + // since it doesn't open a new recursive std::max + max_n << alp::internal::igrid->problemSize( problem_mesh_order - 1 ); + + // close all open parentheses + for( size_t i = 1; i < problem_mesh_order; ++i ) { + 
max_n << " )"; + } + } + + internal::OpGen::initBody << "\n"; + internal::OpGen::initBody << "\t\t\tint32_t totWorkSpaceSize = alp::computeBufferSize( " << max_n.str() << ", sizeof( " << temp_data_type << " ) );\n"; + + } + + std::stringstream init; + alp::internal::ale.generateInit( init ); + internal::OpGen::initBody << init.str(); + + if( alp::internal::symbols.existsTBufTensorDecl() == true ) { + std::stringstream temp_local_init; + alp::internal::symbols.generateTempLocalInit( temp_local_init ); + internal::OpGen::initBody << temp_local_init.str(); + } + } + + // PROCESS + { + std::stringstream process, processCall; + alp::internal::ale.generateProcess( process, processCall ); + internal::OpGen::processFunc.push_back( std::move( process ) ); + internal::OpGen::genericProcessBody << processCall.str(); + } + + alp::internal::OpGen::generate( output_device_code ); + + std::stringstream listOfGlobalTensors; + alp::internal::symbols.printHostLogFile( listOfGlobalTensors ); + output_host_log << listOfGlobalTensors.str() << std::endl; + + output_host_log << internal::OpGen::kernel_id << std::endl; + + output_host_log << internal::OpGen::analyticModelFormalParams.str() << std::endl; + + output_host_log << "$BEGIN_ANALYTIC_MODEL" << std::endl; + output_host_log << internal::OpGen::hostBody.str(); + output_host_log << "$END_ANALYTIC_MODEL" << std::endl; + + output_device_code.close(); + output_host_log.close(); + + internal::OpGen::compileClear(); + + delete alp::internal::igrid; + + return ret; + } + +} + +/** @} */ + +#endif // end _H_ALPASCEND + diff --git a/include/asclib/analytic_model.hpp b/include/asclib/analytic_model.hpp new file mode 100644 index 000000000..37bff0b1c --- /dev/null +++ b/include/asclib/analytic_model.hpp @@ -0,0 +1,496 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * The analytic model to be used by the AscendC code, at operator run-time. + * + * @author A. N. Yzelman + * @date 25th of October, 2023 + */ + +#ifndef _H_ALP_ASCEND_ANALYTIC_MODEL +#define _H_ALP_ASCEND_ANALYTIC_MODEL + +#include +#include +#include +#include // TODO FIXME factor this out -- too high runtime overhead +#include + +#ifdef _DEBUG + #include +#endif + +#include + +#ifndef ASC_FORCE_BINARY_SEARCH + /** Set this macro to true to force a binary search */ + #define ASC_FORCE_BINARY_SEARCH false +#endif + + +/** The ALP@Ascend namespace for run-time components. */ +namespace asc { + + /** + * The analytic model is parametrised in the dimensionality of the process + * mesh and the problem mesh. + * + * For the tensors that are in the pipeline, it furthermore requires static + * knowledge on whether the dynamic axes (the axes over which the user program + * iterates) of the tensors involved with the pipeline, differ. 
+ */ + template< size_t process_order, size_t problem_order, bool has_differing_dyn_axes > + class AnalyticModel { + + private: + + /** Whether to force a binary search */ + static constexpr const bool force_binary = ASC_FORCE_BINARY_SEARCH; + + /** The total scratchpad space, in bytes. */ + const size_t totalSpace; + + std::array< size_t, process_order > processSizes; + + std::array< size_t, problem_order > problemSizes; + + std::array< bool, problem_order > iterationAxes; + + std::vector< std::pair< std::vector< unsigned int >, size_t > > tensors; + + std::array< size_t, problem_order > blockLengths; + + std::vector< unsigned int > largestDynamicAxes; + + size_t largestSize; + + size_t largestStaticSize; + + size_t totalStaticSize; + + /** The size of buffers used by the AscendC program. */ + size_t bufferSize; + + unsigned int numStages; + + unsigned int nDynamicAxes; + + /** Whether the block lengths have been computed. */ + bool lock; + + /** Checks whether current block lengths overrun the buffer */ + bool feasible() const { + const size_t effectiveBufferSize = totalSpace - bufferSize; + size_t required = 0; + for( const auto &pair : tensors ) { + size_t size = pair.second; + for( const auto &dyn_axis : pair.first ) { + const size_t tileSize = std::max( 1ul, blockLengths[ dyn_axis ] ); + size *= tileSize; + } + required += size; + } +#ifdef _DEBUG + std::cout << "\t\tfeasibility of current solution: " << required << " <= " + << effectiveBufferSize << "\n"; +#endif + return required <= effectiveBufferSize; + } + + void analyticSolve() { + const size_t n = tensors.size(); + const size_t effectiveBufferSize = totalSpace - bufferSize; + const size_t maxMul = effectiveBufferSize / totalStaticSize; + const unsigned int d = largestDynamicAxes.size(); +#ifdef _DEBUG + std::cout << "\tanalyticSolve called with n = " << n << ", " + << "effectiveBufferSize = " << effectiveBufferSize << ", " + << "largestStaticSize = " << largestStaticSize << ", " + << "totalStaticSize = " << totalStaticSize << ", " + << "maxMul = " << maxMul << ", and " + << "d = " << d << "\n"; +#endif + if( d == 1 ) { +#ifdef _DEBUG + std::cout << "\t\tsuggested blocksize is " << maxMul << "\n"; +#endif + blockLengths[ largestDynamicAxes[ 0 ] ] = maxMul; + } else { + // taking max with 1 is safe since we already know 1, 1, ..., 1 is a sol + const double root = std::max( std::pow( + static_cast< double >(maxMul), + static_cast< double >(1) / static_cast< double >(d) ), + static_cast< double >(1) ); +#ifdef _DEBUG + std::cout << "\t\tinitial suggested blocksize is " << root << "\n"; +#endif + // select solution + size_t sizeTaken = totalStaticSize; + for( const auto &axis : largestDynamicAxes ) { + blockLengths[ axis ] = root; + sizeTaken *= root; + } + // add one until we fill up the buffer: O(d) work + unsigned int incDim = 0; + assert( totalStaticSize > 0 ); + while( sizeTaken + totalStaticSize <= effectiveBufferSize ) { + (void) ++(blockLengths[ largestDynamicAxes[ incDim ] ]); +#ifdef _DEBUG + std::cout << "\t\tblock_length" << largestDynamicAxes[ incDim ] + << "incremented with one\n"; +#endif + sizeTaken += totalStaticSize; + (void) ++incDim; + if( incDim % largestDynamicAxes.size() == 0 ) { + assert( sizeTaken + totalStaticSize > effectiveBufferSize ); + } + } + } +#ifdef _DEBUG + std::cout << "\t\tWill return the following solution:\n"; + for( unsigned int i = 0; i < problem_order; ++i ) { + std::cout << "\t\t\tblock_length" << i << " = " + << blockLengths[ i ] << "\n"; + } +#endif + } + + void binarySearch() { + if( 
!feasible() ) { + // only in this case we need to compute a non-trivial block length + // we follow a greedy approach where we increase the dimension of the + // blocking only if blocking in one direction was not feasible + unsigned int dim = 1; + std::array< size_t, problem_order > loSizes; + std::array< size_t, problem_order > curSizes; + std::array< size_t, problem_order > hiSizes; + bool foundFeasible = false; + std::array< size_t, problem_order > lastFeasible; + // NOTE this finds the asymptotic optimum if there's one iteration axis + // TODO work out the model in multiple dimensions + while( !foundFeasible ) { + // set up binary search + assert( dim <= largestDynamicAxes.size() ); + for( unsigned int i = 0; i < dim; ++i ) { + const size_t size = problemSizes[ largestDynamicAxes[ i ] ]; + loSizes[ i ] = 1; +#ifdef _DEBUG + std::cout << "\tproblemSizes[ " << i << " ] = " << problemSizes[ i ] + << "\n"; +#endif + curSizes[ i ] = std::max( 1ul, size / 2 ); + hiSizes[ i ] = size; + blockLengths[ i ] = 1; + } + // start binary search + bool converged = false; + while( !converged ) { +#ifdef _DEBUG + for( unsigned int i = 0; i < dim; ++i ) { + std::cout << "\tcurrent search: " << loSizes[ i ] << ", " + << curSizes[ i ] << ", " << hiSizes[ i ] << "\n"; + } +#endif + // active & evaluate current guess + bool notFeasible = true; + { + unsigned int curDim = 0; + for( const auto &dyn_axis : largestDynamicAxes ) { + blockLengths[ dyn_axis ] = curSizes[ curDim ]; + (void) ++curDim; + if( curDim >= dim ) { break; } + } + notFeasible = !feasible(); + } + // update search direction + const std::array< size_t, problem_order > lastCur = curSizes; + if( notFeasible ) { + // mid point is not feasible, update hi and cur + for( unsigned int i = 0; i < dim; ++i ) { + hiSizes[ i ] = curSizes[ i ]; + curSizes[ i ] = std::max( 1ul, + (hiSizes[ i ] - loSizes[ i ]) / 2 + loSizes[ i ] ); + } + } else { + foundFeasible = true; + lastFeasible = curSizes; + // mid point is feasible, update lo and cur + for( unsigned int i = 0; i < dim; ++i ) { + loSizes[ i ] = curSizes[ i ]; + curSizes[ i ] = std::max( 1ul, + (hiSizes[ i ] - loSizes[ i ]) / 2 + loSizes[ i ] ); + } + } + // check convergence + converged = true; + for( unsigned int i = 0; i < dim; ++i ) { + if( lastCur[ i ] != curSizes[ i ] ) { + converged = false; + } + } + } // end binary search + if( !foundFeasible ) { +#ifdef _DEBUG + std::cout << "\tend of binary search without finding any feasible " + << "solution at dim " << dim << "\n"; +#endif + (void) ++dim; + if( dim >= largestDynamicAxes.size() ) { + // This situation should never occur, because the trivial solution of + // blockSize one everywhere should, before calling this function, + // already have been determined to be feasible. 
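+							// (note: computeBlockLengths() verifies that the all-ones block
+							// length fits the scratchpad before it invokes binarySearch())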
+ throw std::runtime_error( "Search failed but this situation should " + "never be encountered-- please submit a bug report" ); + } + } + } + // re-activate last found feasible solution + assert( foundFeasible ); + unsigned int curDim = 0; + for( const auto &dyn_axis : largestDynamicAxes ) { + blockLengths[ dyn_axis ] = lastFeasible[ curDim ]; + (void) ++curDim; + if( curDim >= dim ) { break; } + } + assert( feasible() ); + } + } + + void computeBlockLengths() { +#ifdef _DEBUG + std::cout << "\tIn computeBlockLengths()\n" + << "\t\tlargestDynamicAxes.size() = " << largestDynamicAxes.size() << "\n"; +#endif + for( unsigned int i = 0; i < problem_order; ++i ) { + blockLengths[ i ] = 1; + } + if( !feasible() ) { + throw std::runtime_error( "Operator cannot be executed for the given " + "problem sizes." ); + } + std::vector< unsigned int > activeProcIDs; // TODO FIXME remove dependence on std::vector (for performance) + unsigned int procGridDim = 0; + for( unsigned int i = 0; i < process_order; ++i ) { + assert( processSizes[ i ] > 0 ); + if( processSizes[ i ] > 1 ) { + activeProcIDs.push_back( i ); + (void) ++procGridDim; + } + } + if( procGridDim > largestDynamicAxes.size() ) { + // we need to reduce the process mesh + // we just alternate between expanding the first + // largestDynamicAxes mesh sizes + unsigned int curProcInd = 0; + for( unsigned int i = largestDynamicAxes.size(); i < procGridDim; ++i ) { + processSizes[ curProcInd ] *= processSizes[ i ]; + processSizes[ i ] = 1; + (void) ++curProcInd; + if( curProcInd % procGridDim == 0 ) { + curProcInd = 0; + } + } + } + // compute effective dynamic sizes + for( const auto &dyn_axis : largestDynamicAxes ) { + const size_t n = problemSizes[ dyn_axis ]; + const size_t p = processSizes[ dyn_axis ]; + if( n % p == 0 ) { + problemSizes[ dyn_axis ] = n / p; + } else { + problemSizes[ dyn_axis ] = n / p + 1; + } + } + // check for trivial solution + for( const auto &dyn_axis : largestDynamicAxes ) { +#ifdef _DEBUG + std::cout << "\tSetting blockLengths[ " << dyn_axis << " ] to " + << problemSizes[ dyn_axis ] << "\n"; +#endif + blockLengths[ dyn_axis ] = problemSizes[ dyn_axis ]; + } + if( !feasible() ) { + // choose between solution strategy + if( force_binary || (problem_order > 1 && has_differing_dyn_axes) ) { + binarySearch(); + } else { + analyticSolve(); + } + } + + // done + lock = true; + } + + + public: + + /** + * After successful creation, the analytic model is \em unlocked, meaning + * information of the pipeline may be ingested. + * + * TODO: the analytic model currently takes a single scratchpad size, + * \a spsize. But probably it should take two: one for the vector + * unit, and one for the tensor unit. + */ + AnalyticModel( + const size_t spSize, + std::array< size_t, process_order > procSizes, + std::array< size_t, problem_order > probSizes, + std::array< bool, problem_order > iterAxes + ) : + totalSpace( spSize ), + processSizes( std::move( procSizes ) ), + problemSizes( std::move( probSizes ) ), + iterationAxes( std::move( iterAxes ) ), + largestSize( 0 ), largestStaticSize( 0 ), totalStaticSize( 0 ), + bufferSize( 0 ), numStages( 0 ), + lock( false ) + { + nDynamicAxes = 0; + for( unsigned int i = 0; i < problem_order; ++i ) { + if( iterationAxes[ i ] ) { + (void) ++nDynamicAxes; + } + blockLengths[ i ] = 0; + } + } + + /** + * Registers a buffer required by the pipeline. + * + * Buffers are not allowed to have dynamic dimensions. + * + * \warning This function does not check for violation of this requirement. 
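+		 *
+		 * For example, a two-byte scratch buffer that spans only the second,
+		 * static axis of a 2D problem mesh could be registered as follows
+		 * (illustrative only):
+		 *
+		 * \code
+		 * am.addBuffer( 2, { false, true } );
+		 * \endcode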
+ */ + void addBuffer( + const size_t elemSize, + const std::array< bool, problem_order > &tensor + ) noexcept { + assert( !lock ); + size_t curSize = elemSize; + for( unsigned int i = 0; i < problem_order; ++i ) { + if( tensor[ i ] ) { + curSize *= problemSizes[ i ]; + } + } + bufferSize += curSize; + } + + /** + * Registers a general tensor required by the pipeline. + * + * The given tensor is guaranteed smaller than some other tensor that has + * been, or will be, passed to #addGlobalTensor. + */ + void addMinorTensor( + const size_t elemSize, + const std::array< bool, problem_order > &tensor + ) noexcept { + assert( !lock ); + size_t staticSize = elemSize; + std::vector< unsigned int > dynamicAxes; + for( size_t i = 0; i < problem_order; ++i ) { + if( tensor[ i ] ) { + if( iterationAxes[ i ] ) { + dynamicAxes.push_back( i ); + } else { + staticSize *= problemSizes[ i ]; + } + } + } + totalStaticSize += staticSize; + tensors.push_back( std::make_pair( dynamicAxes, staticSize ) ); +#ifdef _DEBUG + std::cout << "Added minor tensor with " << elemSize << "-byte elements, " + << dynamicAxes.size() << " dynamic axes, and a static size of " + << staticSize << " bytes.\n"; +#endif + } + + /** + * Registers a general tensor required by the pipeline. + */ + void addGlobalTensor( + const size_t elemSize, + const std::array< bool, problem_order > &tensor + ) { + assert( !lock ); + size_t staticSize = elemSize; + std::vector< unsigned int > dynamicAxes; + for( size_t i = 0; i < problem_order; ++i ) { + if( tensor[ i ] ) { + if( iterationAxes[ i ] ) { + dynamicAxes.push_back( i ); + } else { + staticSize *= problemSizes[ i ]; + } + } + } + totalStaticSize += staticSize; + tensors.push_back( std::make_pair( dynamicAxes, staticSize ) ); + size_t globalSize = staticSize; + for( const unsigned int &axis : dynamicAxes ) { + globalSize *= problemSizes[ axis ]; + } +#ifdef _DEBUG + std::cout << "\tadded global tensor with elements of " << elemSize + << " bytes, with a globalSize of " << globalSize + << " bytes, while the current largest size is " << largestSize + << ", and #dynamic axes is " << dynamicAxes.size() + << "\n"; +#endif + if( globalSize > largestSize ) { + largestDynamicAxes = std::move( dynamicAxes ); + largestSize = globalSize; + largestStaticSize = staticSize; + } + } + + /** + * This is actually a place-holder for a mechanism that gives the analytic + * model more precise information on the stages in the pipeline. Rationale + * on why this is needed: some stages (AscendC operators) require work space + * buffers. + * + * @param[in] n The number of stages in the pipeline. + */ + void setNumStages( const size_t n ) { + numStages = n; + } + + /** + * Computes the block sizes suggested by the analytic model. + * + * Locks the analytic model. 
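+		 *
+		 * @param[in] axis The problem-mesh axis for which to return the block
+		 *                 length suggested by the model.
+		 *
+		 * @returns The suggested block length for \a axis; axes that are not
+		 *          iterated over keep a block length of one.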
+ */ + size_t getBlockSize( const unsigned int axis ) { + if( !lock ) { + computeBlockLengths(); + } + return blockLengths[ axis ]; + } + + }; + +} + +#endif + diff --git a/include/asclib/ascendlib.hpp b/include/asclib/ascendlib.hpp new file mode 100644 index 000000000..51059139e --- /dev/null +++ b/include/asclib/ascendlib.hpp @@ -0,0 +1,348 @@ +#include + +using namespace AscendC; + +namespace alp { + + __aicore__ inline int32_t max( const int32_t a, const int32_t b ) { + if( a > b) { + return a; + } + return b; + } + + __aicore__ inline int32_t RoundUp(int32_t a, int32_t b) { + return (a + b - 1) / b; + } + + __aicore__ inline int32_t computeBufferSize( const uint32_t max_n, const uint32_t data_size ) + { + // Initializing data required by temporary Tensors + int32_t ascend_el_per_blk = ONE_BLK_SIZE / data_size; + int32_t elementsPerRepeat = ONE_REPEAT_BYTE_SIZE / data_size; + int32_t firstMaxRepeat = max_n / elementsPerRepeat; + int32_t iter1OutputCount = firstMaxRepeat * 2; + int32_t tmpBufsColsReduce = RoundUp( iter1OutputCount, ascend_el_per_blk ) * ascend_el_per_blk; + int32_t totWorkSpaceSize = ( ascend_el_per_blk + tmpBufsColsReduce + max_n ); + + return totWorkSpaceSize; + } + + template< typename T3 = half, typename T1, typename T2 > + __aicore__ inline void DataMove( + T1 tensorOut, + T2 tensorIn, + const uint32_t blocklen + ) { + DataCopy( tensorOut, tensorIn, blocklen ); + } + + template< typename T3 = half, typename T1, typename T2 > + __aicore__ inline void DataMove( + T1 tensorOut, + T2 tensorIn, + const uint32_t nblocks, + const uint32_t blocklen, + const uint32_t src_stride, + const uint32_t dst_stride + ) { + DataCopyParams dcp; + dcp.blockCount = nblocks; + dcp.blockLen = sizeof( T3 ) * blocklen / 32 ; + dcp.srcStride = sizeof( T3 ) * ( src_stride - blocklen ) / 32; + dcp.dstStride = sizeof( T3 ) * ( dst_stride - blocklen ) / 32; + DataCopy( tensorOut, tensorIn, dcp ); + } + + // Bock (matrix) versions + + __aicore__ inline void BlockSet( + AscendC::LocalTensor< half > tensorOut, + half value, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Duplicate( tensorOut, value, nblocks * blocklen ); + } + + __aicore__ inline void BlockSet( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + const uint32_t nblocks, + const uint32_t blocklen + ) { + DataCopy( tensorOut, tensorIn, nblocks * blocklen ); + } + + __aicore__ inline void BlockExp( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + const uint32_t nblocks, + const uint32_t blocklen + ) { + for( uint32_t k = 0; k < nblocks ; ++k ) { + Exp( tensorOut[ k * blocklen ], tensorIn[ k * blocklen ], blocklen ); + } + } + + __aicore__ inline void BlockReduceSum( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { + // for( uint32_t k = 0; k < nblocks ; ++k ) { + // ReduceSum( tensorOut[ k ], tensorIn[ k * blocklen ], Work, blocklen ); + // } + uint32_t repeat = nblocks; + uint32_t srcRepStride = blocklen; + srcRepStride = ( sizeof( half ) * srcRepStride ) / 32; + uint32_t nr = repeat/255; + if( repeat % 255 ) nr++; + for( uint32_t ir = 0; ir < nr ; ++ir ) { + uint32_t locrepeat = 255; + if( ir == nr - 1 ) locrepeat = repeat - ir * 255; + WholeReduceSum( + tensorOut[ ir * 255 ], + tensorIn[ ir * 255 * blocklen ], + blocklen, // mask + locrepeat, // repeat + 1, // dstStride + 1, // srcBlkStride + srcRepStride// srcRepStride + ); + 
} + + } + + __aicore__ inline void BlockReduceMax( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { +#ifdef ASCEND910B + uint32_t repeat = nblocks; + uint32_t srcRepStride = blocklen; + srcRepStride = ( sizeof( half ) * srcRepStride ) / 32; + uint32_t nr = repeat/255; + if( repeat % 255 ) nr++; + for( uint32_t ir = 0; ir < nr ; ++ir ) { + uint32_t locrepeat = 255; + if( ir == nr - 1 ) locrepeat = repeat - ir * 255; + WholeReduceMax( + tensorOut[ ir * 255 ], + tensorIn[ ir * 255 * blocklen ], + blocklen, // mask + locrepeat, // repeat + 1, // dstStride + 1, // srcBlkStride + srcRepStride, // srcRepStride + ReduceOrder::ORDER_ONLY_VALUE + ); + } +#else + // TODO replace with better + for( uint32_t k = 0; k < nblocks ; ++k ) { + ReduceMax( tensorOut[ k ], tensorIn[ k * blocklen ], Work, blocklen ); + } +#endif + } + + __aicore__ inline void BlockBcastMinus( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { + for( uint32_t k = 0; k < nblocks ; ++k ) { + Duplicate( Work, tensorInB[ k ].GetValue( 0 ), blocklen ); // broadcast + Sub( tensorOut[ k * blocklen ], tensorInA[ k * blocklen ], Work, blocklen ); + } + } + + __aicore__ inline void BlockEwiseMinus( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Sub( tensorOut, tensorInA, tensorInB, nblocks * blocklen ); + } + + __aicore__ inline void BlockEwiseSum( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Add( tensorOut, tensorInA, tensorInB, nblocks * blocklen ); + } + + __aicore__ inline void BlockEwiseMax( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Max( tensorOut, tensorInA, tensorInB, nblocks * blocklen ); + } + + __aicore__ inline void BlockBcastDivide( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { + for( uint32_t k = 0; k < nblocks ; ++k ) { + Duplicate( Work, tensorInB[ k ].GetValue( 0 ), blocklen ); // broadcast + Div( tensorOut[ k * blocklen ], tensorInA[ k * blocklen ], Work, blocklen ); + } + } + + __aicore__ inline void BlockBcastMultiply( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t nblocks, + const uint32_t blocklen + ) { + for( uint32_t k = 0; k < nblocks ; ++k ) { + Duplicate( Work, tensorInB[ k ].GetValue( 0 ), blocklen ); // broadcast + Mul( tensorOut[ k * blocklen ], tensorInA[ k * blocklen ], Work, blocklen ); + } + } + + __aicore__ inline void BlockEwiseMultiply( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t nblocks, + const uint32_t blocklen + ) { + Mul( tensorOut, tensorInA, tensorInB, nblocks * blocklen ); + } + + // 
Vector versions + + __aicore__ inline void VectorSet( + AscendC::LocalTensor< half > tensorOut, + half value, + const uint32_t blocklen + ) { + Duplicate( tensorOut, value, blocklen ); + } + + __aicore__ inline void VectorSet( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + const uint32_t blocklen + ) { + DataCopy( tensorOut, tensorIn, blocklen ); + } + + __aicore__ inline void VectorExp( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + const uint32_t blocklen + ) { + Exp( tensorOut, tensorIn, blocklen ); + } + + __aicore__ inline void VectorReduceSum( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + ReduceSum( tensorOut, tensorIn, Work, blocklen ); + } + + __aicore__ inline void VectorReduceMax( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorIn, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + ReduceMax( tensorOut, tensorIn, Work, blocklen ); + } + + __aicore__ inline void VectorBcastMinus( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + Duplicate( Work, tensorInB.GetValue( 0 ), blocklen ); // broadcast + Sub( tensorOut, tensorInA, Work, blocklen ); + } + + __aicore__ inline void VectorEwiseMinus( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t blocklen + ) { + Sub( tensorOut, tensorInA, tensorInB, blocklen ); + } + + __aicore__ inline void VectorEwiseSum( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t blocklen + ) { + Add( tensorOut, tensorInA, tensorInB, blocklen ); + } + + __aicore__ inline void VectorEwiseMax( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t blocklen + ) { + Max( tensorOut, tensorInA, tensorInB, blocklen ); + } + + __aicore__ inline void VectorBcastDivide( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + Duplicate( Work, tensorInB.GetValue( 0 ), blocklen ); // broadcast + Div( tensorOut, tensorInA, Work, blocklen ); + } + + __aicore__ inline void VectorBcastMultiply( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + AscendC::LocalTensor< half > Work, + const uint32_t blocklen + ) { + Duplicate( Work, tensorInB.GetValue( 0 ), blocklen ); // broadcast + Mul( tensorOut, tensorInA, Work, blocklen ); + } + + __aicore__ inline void VectorEwiseMultiply( + AscendC::LocalTensor< half > tensorOut, + AscendC::LocalTensor< half > tensorInA, + AscendC::LocalTensor< half > tensorInB, + const uint32_t blocklen + ) { + Mul( tensorOut, tensorInA, tensorInB, blocklen ); + } +} diff --git a/include/graphblas/ascend/alloc.hpp b/include/graphblas/ascend/alloc.hpp new file mode 100644 index 000000000..d31123580 --- /dev/null +++ b/include/graphblas/ascend/alloc.hpp @@ -0,0 +1,65 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Allocator functions for the Ascend backend + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ALLOC_ASCEND +#define _H_GRB_ALLOC_ASCEND + +#include + +#include + +#include "config.hpp" + + +namespace grb { + + namespace utils { + + namespace internal { + + template<> + class Allocator< ascend > { + + private: + + /** Prevent initialisation. */ + Allocator(); + + public: + + /** Refer to the standard allocation mechanism. */ + typedef AllocatorFunctions< reference > functions; + + }; + + } // namespace internal + + } // namespace utils + +} // namespace grb + +#endif + diff --git a/include/graphblas/ascend/benchmark.hpp b/include/graphblas/ascend/benchmark.hpp new file mode 100644 index 000000000..0b1835671 --- /dev/null +++ b/include/graphblas/ascend/benchmark.hpp @@ -0,0 +1,95 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Ascend implementation of the benchmarker. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BENCH +#define _H_GRB_ASCEND_BENCH + +#include +#include + +#include "exec.hpp" + + +namespace grb { + + /** + * The Benchmarker class is based on that of the reference backend + * + * \internal The public API simply wraps the reference Benchmarker. + */ + template< enum EXEC_MODE mode > + class Benchmarker< mode, ascend > { + + private: + + /** \internal Reuse reference benchmarker. */ + Benchmarker< mode, reference > ref; + + + public: + + /** \internal Mirror reference constructor. */ + Benchmarker( + size_t process_id = 0, + size_t nprocs = 1, + std::string hostname = "localhost", + std::string port = "0" + ) : + ref(process_id, nprocs, hostname, port) + {} + + /** \internal Mirror reference exec. */ + template< typename U > + RC exec( + void ( *grb_program )( const void *, const size_t, U & ), + const void * data_in, const size_t in_size, + U &data_out, + const size_t inner, const size_t outer, + const bool broadcast = false + ) const { + return ref.exec( + grb_program, data_in, in_size, data_out, inner, outer, broadcast + ); + } + + /** \internal Mirror reference exec. 
*/ + template< typename T, typename U > + RC exec( + void ( *grb_program )( const T &, U & ), + const T &data_in, U &data_out, + const size_t inner, + const size_t outer, + const bool broadcast = false + ) { + return ref.exec( grb_program, data_in, data_out, inner, outer, broadcast ); + } + + }; + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_BENCH'' + diff --git a/include/graphblas/ascend/blas1.hpp b/include/graphblas/ascend/blas1.hpp new file mode 100644 index 000000000..e10bffeeb --- /dev/null +++ b/include/graphblas/ascend/blas1.hpp @@ -0,0 +1,11500 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Level-1 primitive implementation for Ascend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BLAS1 +#define _H_GRB_ASCEND_BLAS1 + +#include //for printing to stderr +#include //for std::enable_if + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" +#include "vector_wrapper.hpp" +#include "boolean_dispatcher_blas1.hpp" + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a value that matches the expected type.\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + +#define NO_CAST_OP_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | For all mismatches in the domains of input " \ + "parameters and the operator domains, as specified in the " \ + "documentation of the function " y ", supply an input argument of " \ + "the expected type instead.\n" \ + "* Possible fix 3 | Provide a compatible operator where all domains " \ + "match those of the input parameters, as specified in the " \ + "documentation of the function " y ".\n" \ + 
"********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + +} + +namespace grb { + + /** + * \defgroup BLAS1_NB The Level-1 ALP/GraphBLAS routines -- ascend backend + * + * @{ + */ + + namespace internal { + + template< + bool left, + class Monoid, + typename InputType, + class Coords + > + RC fold_from_vector_to_scalar_dense( + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Vector< InputType, ascend, Coords > &to_fold, + const Monoid &monoid + ) { + const InputType *__restrict__ const raw = internal::getRaw( to_fold ); + + const size_t start = lower_bound; + const size_t end = upper_bound; + + if( start < end ) { + if( left ) { + monoid.getOperator().foldlArray( + thread_local_output, raw + start, end - start ); + } else { + monoid.getOperator().foldrArray( + raw + start, thread_local_output, end - start ); + } + } + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr, + bool masked, + bool left, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_vectorDriven( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + const size_t n = internal::getCoordinates( to_fold ).size(); + const size_t local_n = upper_bound - lower_bound; + const size_t local_to_fold_nz = ( already_dense_input_to_fold ) + ? local_n + : local_to_fold.nonzeroes(); + + assert( n > 0 ); + assert( !masked || internal::getCoordinates( mask ).size() == n ); + +#ifdef NDEBUG + (void) n; + (void) local_n; +#endif + + RC ret = SUCCESS; + + const size_t start = 0; + const size_t end = local_to_fold_nz; + + // compute thread-local partial reduction + for( size_t k = start; k < end; ++k ) { + const size_t i = ( (already_dense_input_to_fold) + ? 
k + : local_to_fold.index( k ) ) + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !utils::interpretMask< descr >( + internal::getCoordinates( mask ).assigned( i ), + internal::getRaw( mask ), i ) + ) { + continue; + } + } else { + if( !utils::interpretMask< descr >( + local_mask.assigned( i - lower_bound ), internal::getRaw( mask ), i ) + ) { + continue; + } + } + } + RC local_rc; + if( left ) { + local_rc = foldl< descr >( thread_local_output, + internal::getRaw( to_fold )[ i ], monoid.getOperator() ); + } else { + local_rc = foldr< descr >( internal::getRaw( to_fold )[ i ], + thread_local_output, monoid.getOperator() ); + } + assert( local_rc == SUCCESS ); + if( local_rc != SUCCESS ) { + ret = local_rc; + } + } + + return ret; + } + + template< + Descriptor descr, + bool left, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_maskDriven( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + const size_t n = internal::getCoordinates( to_fold ).size(); + + assert( internal::getCoordinates( mask ).size() == n ); + assert( n > 0 ); +#ifdef NDEBUG + (void) n; +#endif + const size_t local_n = upper_bound - lower_bound; + const size_t local_mask_nz = ( already_dense_mask ) + ? local_n + : local_mask.nonzeroes(); + + RC ret = SUCCESS; + + const size_t start = 0; + const size_t end = local_mask_nz; + + // compute thread-local partial reduction + for( size_t k = start; k < end; ++k ) { + const size_t i = ( (already_dense_mask) + ? k + : local_mask.index( k ) + ) + lower_bound; + if( !( already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ) ) + ) { + continue; + } + if( !utils::interpretMask< descr >( true, internal::getRaw( mask ), i ) ) { + continue; + } + RC local_rc; + if( left ) { + local_rc = foldl< descr >( thread_local_output, + internal::getRaw( to_fold )[ i ], monoid.getOperator() ); + } else { + local_rc = foldr< descr >( internal::getRaw( to_fold )[ i ], + thread_local_output, monoid.getOperator() ); + } + assert( local_rc == SUCCESS ); + if( local_rc != SUCCESS ) { + ret = local_rc; + } + } + + return ret; + } + + template< + Descriptor descr, + bool masked, + bool left, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_fullLoopSparse( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { +#ifdef _DEBUG + std::cout << "Entered fold_from_vector_to_scalar_fullLoopSparse\n"; +#endif + +#ifndef NDEBUG + const size_t n = internal::getCoordinates( to_fold ).size(); + const size_t local_n = already_dense_input_to_fold + ? 
upper_bound - lower_bound + : local_to_fold.size(); + assert( local_n > 0 ); + + (void) n; +#endif + RC ret = SUCCESS; + + size_t i = lower_bound; + const size_t end = upper_bound; + + // some sanity checks + assert( i <= end ); + assert( end <= n ); + + // assume current i needs to be processed, forward until we find an index + // for which the mask evaluates true + bool process_current_i = true; + if( masked && i < end ) { + process_current_i = utils::interpretMask< descr >( + already_dense_mask + ? internal::getCoordinates( mask ).assigned( i ) + : local_mask.assigned( i - lower_bound ), + internal::getRaw( mask ), i ) && ( + already_dense_input_to_fold || local_to_fold.assigned( i - lower_bound ) + ); + // if not + while( !process_current_i ) { + // forward to next element + (void) ++i; + // check that we are within bounds + if( i == end ) { + break; + } + // evaluate whether we should process this i-th element + process_current_i = utils::interpretMask< descr >( + already_dense_mask + ? internal::getCoordinates( mask ).assigned( i ) + : local_mask.assigned( i - lower_bound ), + internal::getRaw( mask ), i ) && ( + already_dense_input_to_fold || local_to_fold.assigned( i - lower_bound ) + ); + } + } + + if( !masked && i < end ) { + process_current_i = local_to_fold.assigned( i - lower_bound ); + while( !process_current_i ) { + (void) ++i; + if( i == end ) { + break; + } + process_current_i = already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ); + } + } + +#ifndef NDEBUG + if( i < end ) { + assert( i < n ); + } +#endif + + // declare thread-local variable and set our variable to the first value in + // our block + typename Monoid::D3 local = + monoid.template getIdentity< typename Monoid::D3 >(); + if( end > 0 ) { + if( i < end ) { +#ifdef _DEBUG + std::cout << "\t processing start index " << i << "\n"; +#endif + local = static_cast< typename Monoid::D3 >( + internal::getRaw( to_fold )[ i ] ); + } + } + + // if we have more values to fold + if( i + 1 < end ) { + + // keep going until we run out of values to fold + while( true ) { + + // forward to next variable + (void) ++i; + + // forward more (possibly) if in the masked case + if( masked && i < end ) { + assert( i < n ); + process_current_i = utils::interpretMask< descr >( + already_dense_mask + ? internal::getCoordinates( mask ).assigned( i ) + : local_mask.assigned( i - lower_bound ), + internal::getRaw( mask ), i + ) && ( + already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ) + ); + while( !process_current_i ) { + (void) ++i; + if( i == end ) { + break; + } + assert( i < end ); + assert( i < n ); + process_current_i = utils::interpretMask< descr >( + already_dense_mask + ? internal::getCoordinates( mask ).assigned( i ) + : local_mask.assigned( i - lower_bound ), + internal::getRaw( mask ), i + ) && ( + already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ) + ); + } + } + if( !masked && i < end ) { + assert( i < n ); + process_current_i = already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ); + while( !process_current_i ) { + (void) ++i; + if( i == end ) { + break; + } + assert( i < end ); + assert( i < n ); + process_current_i = already_dense_input_to_fold || + local_to_fold.assigned( i - lower_bound ); + } + } + + // stop if past end + if( i >= end ) { + break; + } + +#ifdef _DEBUG + std::cout << "\t processing index " << i << "\n"; +#endif + + // do fold + assert( i < n ); + if( left ) { + ret = ret ? 
ret : foldl< descr >( local, internal::getRaw( to_fold )[ i ], + monoid.getOperator() ); + } else { + ret = ret ? ret : foldr< descr >( internal::getRaw( to_fold )[ i ], local, + monoid.getOperator() ); + } + assert( ret == SUCCESS ); + + if( ret != SUCCESS ) { + break; + } + } + } + + if( left ) { + ret = ret ? ret : foldl< descr >( thread_local_output, local, + monoid.getOperator() ); + } else { + ret = ret ? ret : foldr< descr >( local, thread_local_output, + monoid.getOperator() ); + } + assert( ret == SUCCESS ); + + return ret; + } + + /** + * Dispatches to any of the four above variants depending on asymptotic cost + * analysis. + */ + template< + Descriptor descr = descriptors::no_operation, + bool masked, + bool left, // if this is false, assumes right-looking fold + class Monoid, + typename IOType, + typename InputType, + typename MaskType, + typename Coords + > + RC fold_from_vector_to_scalar_generic( + IOType &fold_into, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + // static sanity checks + static_assert( grb::is_monoid< Monoid >::value, + "grb::foldl can only be called using monoids. This " + "function should not have been called-- please submit a " + "bugreport." ); + + const size_t n = internal::getCoordinates( to_fold ).size(); + + // mask must be of equal size as input vector + if( masked && n != size( mask ) ) { + return MISMATCH; + } + + // handle trivial cases + if( n == 0 ) { + return SUCCESS; + } + + // some globals used during the folding + RC ret = SUCCESS; + typename Monoid::D3 global = + monoid.template getIdentity< typename Monoid::D3 >(); + + size_t local_reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) * + config::CACHE_LINE_SIZE::value(); + IOType local_reduced[ local_reduced_size ]; + + for( + size_t i = 0; + i < local_reduced_size; + i += config::CACHE_LINE_SIZE::value() + ) { + local_reduced[ i ] = monoid.template getIdentity< typename Monoid::D3 >(); + } + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&to_fold, &mask, &monoid, &local_reduced] ( + internal::Pipeline &pipeline, + const size_t lower_bound, + const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage fold_from_vector_to_scalar_generic " + "in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC ret = SUCCESS; + + Coords local_to_fold, local_mask; + size_t local_n = upper_bound - lower_bound; + size_t local_to_fold_nz = local_n; + size_t local_mask_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_to_fold = true; + bool already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_to_fold = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( to_fold ) ); + if( !already_dense_input_to_fold ) { +#else + already_dense_input_to_fold = false; +#endif + local_to_fold = internal::getCoordinates( to_fold ).asyncSubset( + lower_bound, upper_bound ); + local_to_fold_nz = local_to_fold.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( masked ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + 
&internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + } + + unsigned int thread_id = omp_get_thread_num() * + config::CACHE_LINE_SIZE::value(); + + // dispatch, dense variant + if( ( (descr & descriptors::dense) || local_to_fold_nz == local_n ) && ( + !masked || ( + (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && + local_mask_nz == local_n + ) + ) + ) { +#ifdef _DEBUG + std::cout << "\t dispatching to dense variant\n"; +#endif + ret = fold_from_vector_to_scalar_dense< left >( + local_reduced[ thread_id ], lower_bound, upper_bound, to_fold, monoid ); + } else if( masked && (descr & descriptors::invert_mask ) ) { + // in this case we are forced to dispatch to O(n) +#ifdef _DEBUG + std::cout << "\t forced dispatch to O(n) sparse variant\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + ret = boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse< +#else + ret = fold_from_vector_to_scalar_fullLoopSparse< +#endif + descr, true, left + >( + already_dense_input_to_fold, already_dense_mask, + local_reduced[ thread_id ], lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + constexpr const size_t threeWs = + sizeof( typename Coords::StackType ) + + sizeof( typename Coords::ArrayType ) + + MaskWordSize< descr, MaskType >::value; + const size_t fullLoop = masked + ? 2 * sizeof( typename Coords::ArrayType ) * local_n + + sizeof( MaskType ) * local_mask_nz + : sizeof( typename Coords::ArrayType ) * local_n; + const size_t vectorLoop = masked + ? threeWs * local_to_fold_nz + : sizeof( typename Coords::StackType ) * local_to_fold_nz; + const size_t maskLoop = masked + ? 
threeWs * local_mask_nz + : std::numeric_limits< size_t >::max(); + if( fullLoop >= vectorLoop && maskLoop >= vectorLoop ) { +#ifdef _DEBUG + std::cout << "\t dispatching to vector-driven sparse variant\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + ret = boolean_dispatcher_fold_from_vector_to_scalar_vectorDriven< +#else + ret = fold_from_vector_to_scalar_vectorDriven< +#endif + descr, masked, left + >( + already_dense_input_to_fold, already_dense_mask, + local_reduced[ thread_id ], lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else if( vectorLoop >= fullLoop && maskLoop >= fullLoop ) { +#ifdef _DEBUG + std::cout << "\t dispatching to O(n) sparse variant\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + ret = boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse< +#else + ret = fold_from_vector_to_scalar_fullLoopSparse< +#endif + descr, masked, left + >( + already_dense_input_to_fold, already_dense_mask, + local_reduced[ thread_id ], lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + assert( maskLoop < fullLoop && maskLoop < vectorLoop ); + assert( masked ); +#ifdef _DEBUG + std::cout << "\t dispatching to mask-driven sparse variant\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + ret = boolean_dispatcher_fold_from_vector_to_scalar_maskDriven< +#else + ret = fold_from_vector_to_scalar_maskDriven< +#endif + descr, left + >( + already_dense_input_to_fold, already_dense_mask, + local_reduced[ thread_id ], lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } + + return ret; + }; + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: " + << "fold_from_vector_to_scalar_generic" << std::endl; +#endif + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_SCALAR_GENERIC, + n, + sizeof( IOType ), + dense_descr, + true, + nullptr, nullptr, nullptr, nullptr, + &to_fold, + ( masked ) ? &mask : nullptr, + nullptr, + nullptr, + &internal::getCoordinates( to_fold ), + (masked) ? 
&internal::getCoordinates( mask ) : nullptr, + nullptr, + nullptr, + nullptr + ); + + if( ret == SUCCESS ) { + for( + size_t i = 0; + i < local_reduced_size; + i += config::CACHE_LINE_SIZE::value() + ) { + RC rc; + if( left ) { + rc = foldl< descr >( global, local_reduced[ i ], monoid.getOperator() ); + } else { + rc = foldr< descr >( local_reduced[ i ], global, monoid.getOperator() ); + } + assert( rc == SUCCESS ); + if( rc != SUCCESS ) { + ret = rc; + } + } + } + + // accumulate +#ifdef _DEBUG + std::cout << "\t accumulating " << global << " into " << fold_into << "\n"; +#endif + + if( ret == SUCCESS ) { + if( left ) { + ret = foldl< descr >( fold_into, global, monoid.getOperator() ); + } else { + ret = foldr< descr >( global, fold_into, monoid.getOperator() ); + } + } + + return ret; + } + + /** + * \internal + * @tparam left If false, right-looking fold is assumed (and left-looking + * otherwise) + * @tparam sparse Whether \a vector was sparse + * @tparam monoid Whether \a op is actually a monoid + * \endinternal + */ + template< + Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, +#endif + typename MaskType, + typename IOType, + typename InputType, + typename Coords, + class OP + > + RC fold_from_scalar_to_vector_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_vector, + const Coords * const local_mask_ptr, + Vector< IOType, ascend, Coords > &vector, + const Vector< MaskType, ascend, Coords > * const mask, + const InputType &scalar, + const OP &op, + const Phase &phase + ) { + constexpr const bool dense_descr = descr & descriptors::dense; + assert( !masked || mask != nullptr ); + assert( !masked || local_mask_ptr != nullptr ); + + Coords local_mask; + if( masked ) { + local_mask = *local_mask_ptr; + } + + const size_t local_n = upper_bound - lower_bound; + const size_t local_vector_nz = (sparse || !already_dense_output) + ? local_vector.nonzeroes() : local_n; + const size_t local_mask_nz = ( masked ) + ? ( ( already_dense_mask ) + ? local_n + : local_mask.nonzeroes() + ) + : 0; + + const size_t n = internal::getCoordinates( vector ).size(); + + if( masked && internal::getCoordinates( *mask ).size() != n ) { + return MISMATCH; + } + if( dense_descr && sparse ) { + return ILLEGAL; + } + if( n == 0 ) { + return SUCCESS; + } + if( phase == RESIZE ) { + return SUCCESS; + } + + assert( phase == EXECUTE ); + IOType * __restrict__ const x = internal::getRaw( vector ); + const MaskType * __restrict__ const m = ( masked ) + ? internal::getRaw( *mask ) + : nullptr; + + if( sparse && monoid && !masked ) { + for( size_t i = lower_bound; i < upper_bound; ++i ) { + if( already_dense_output || local_vector.assigned( i - lower_bound ) ) { + if( left ) { + (void) foldl< descr >( x[ i ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ i ], op ); + } + } else { + x[ i ] = static_cast< IOType >( scalar ); + } + } + + if( !already_dense_output ) { + local_vector.local_assignAllNotAlreadyAssigned(); + } + } else if( sparse && monoid && masked ) { + for( size_t i = 0; i < local_mask_nz; ++i ) { + const size_t index = ( ( already_dense_mask ) + ? 
i + : local_mask.index( i ) ) + lower_bound; + if( already_dense_mask ) { + if( !internal::getCoordinates( *mask ).template mask< descr >( + index, m ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, + m + lower_bound ) + ) { + continue; + } + } + if( already_dense_output || local_vector.assign( index - lower_bound ) ) { + if( left ) { + (void) foldl< descr >( x[ index ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ index ], op ); + } + } else { + x[ index ] = static_cast< IOType >( scalar ); + } + } + } else if( sparse && !monoid ) { + const bool maskDriven = masked ? local_mask_nz < local_vector_nz : false; + if( maskDriven ) { + for( size_t i = 0; i < local_mask_nz; ++i ) { + const size_t index = ( ( already_dense_mask ) + ? i + : local_mask.index( i ) ) + lower_bound; + if( already_dense_mask ) { + if( !internal::getCoordinates( *mask ).template mask< descr >( + index, m ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, + m + lower_bound ) + ) { + continue; + } + } + if( already_dense_output || local_vector.assign( index - lower_bound ) ) { + if( left ) { + (void) foldl< descr >( x[ index ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ index ], op ); + } + } + } + } else { + for( size_t i = 0; i < local_vector_nz; ++i ) { + const size_t index = (already_dense_output + ? i + : local_vector.index( i ) + ) + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !( internal::getCoordinates( *mask ).template mask< descr >( + index, m ) ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, m + + lower_bound ) + ) { + continue; + } + } + } + if( left ) { + (void) foldl< descr >( x[ index ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ index ], op ); + } + } + } + } else if( !sparse && masked ) { + for( size_t i = 0; i < local_mask_nz; ++i ) { + const size_t index = ( ( already_dense_mask ) + ? i + : local_mask.index( i ) ) + lower_bound; + if( already_dense_mask ) { + if( !( internal::getCoordinates( *mask ).template mask< descr >( + index, m ) ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, m + + lower_bound ) + ) { + continue; + } + } + + if( left ) { + (void) foldl< descr >( x[ index ], scalar, op ); + } else { + (void) foldr< descr >( scalar, x[ index ], op ); + } + } + } else { + // if target vector is dense and there is no mask, then + // there is no difference between monoid or non-monoid behaviour. 
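+				// in this dense, unmasked case the fold reduces to the operator's
+				// array-scalar fold variants applied over the local range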
+ assert( !sparse ); + assert( !masked ); + assert( local_vector_nz == local_n ); + + if( local_n > 0 ) { + if( left ) { + op.eWiseFoldlAS( x + lower_bound, scalar, local_n ); + } else { + op.eWiseFoldrSA( scalar, x + lower_bound, local_n ); + } + } + } + + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr, + bool left, // if this is false, the right-looking fold is assumed + bool sparse, + bool masked, + bool monoid, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + typename MaskType, + typename IOType, + typename IType, + typename Coords, + class OP + > + RC fold_from_vector_to_vector_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_input_to_fold, + bool already_dense_mask, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_fold_into, + const Coords * const local_m_ptr, + const Coords &local_to_fold, + Vector< IOType, ascend, Coords > &fold_into, + const Vector< MaskType, ascend, Coords > * const m, + const Vector< IType, ascend, Coords > &to_fold, + const OP &op, + const Phase phase + ) { + constexpr const bool dense_descr = descr & descriptors::dense; + assert( !masked || (m != nullptr) ); + + Coords local_m; + if( masked && !already_dense_mask ) { + local_m = *local_m_ptr; + } + + const size_t local_n = upper_bound - lower_bound; + const size_t local_fold_into_nz = already_dense_output + ? local_n + : local_fold_into.nonzeroes(); + const size_t local_to_fold_nz = already_dense_input_to_fold + ? local_n + : local_to_fold.nonzeroes(); + const size_t local_m_nz = ( masked ) + ? ( already_dense_mask + ? local_n + : local_m.nonzeroes() + ) + : 0; + + const size_t n = size( fold_into ); + if( n != size( to_fold ) ) { + return MISMATCH; + } + if( masked && size( *m ) != n ) { + return MISMATCH; + } + if( dense_descr && sparse ) { + return ILLEGAL; + } + if( phase == RESIZE ) { + return SUCCESS; + } + + assert( phase == EXECUTE ); + + if( !sparse && !masked ) { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: in dense variant\n"; +#endif + +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: in sequential variant\n"; +#endif + if( left ) { + op.eWiseFoldlAA( internal::getRaw( fold_into ) + lower_bound, + internal::getRaw( to_fold ) + lower_bound, local_n ); + } else { + op.eWiseFoldrAA( internal::getRaw( to_fold ) + lower_bound, + internal::getRaw( fold_into ) + lower_bound, local_n ); + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: in sparse variant\n"; + std::cout << "\tfolding vector of " << local_to_fold_nz << " nonzeroes " + << "into a vector of " << local_fold_into_nz << " nonzeroes...\n"; +#endif + if( + masked && + local_fold_into_nz == local_n && + local_to_fold_nz == local_n + ) { + // use sparsity structure of mask for this eWiseFold + if( left ) { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using the " + << "mask's sparsity structure\n"; +#endif + for( size_t k = 0; k < local_m_nz; ++k ) { + const size_t i = ( already_dense_mask + ? 
k + : local_m.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Left-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using the " + << "mask's sparsity structure\n"; +#endif + for( size_t k = 0; k < local_m_nz; ++k ) { + const size_t i = ( already_dense_mask + ? k + : local_m.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Right-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } + } else if( !masked && local_fold_into_nz == local_n ) { + // use sparsity structure of to_fold for this eWiseFold + if( left ) { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using " + << "to_fold's sparsity\n"; +#endif + for( size_t k = 0; k < local_to_fold_nz; ++k ) { + const size_t i = ( already_dense_input_to_fold + ? k + : local_to_fold.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Left-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using " + << "to_fold's sparsity\n"; +#endif + for( size_t k = 0; k < local_to_fold_nz; ++k ) { + const size_t i = ( already_dense_input_to_fold + ? k + : local_to_fold.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Right-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } + } else if( !masked && local_to_fold_nz == local_n ) { + // use sparsity structure of fold_into for this eWiseFold + if( left ) { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldl, using " + << "fold_into's sparsity\n"; +#endif + for( size_t k = 0; k < local_fold_into_nz; ++k ) { + const size_t i = ( already_dense_output + ? k + : local_fold_into.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Left-folding " << to_fold[ i ] << " into " + << fold_into[ i ]; +#endif + (void) foldl< descr >( fold_into[ i ], to_fold[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: foldr, using " + << "fold_into's sparsity\n"; +#endif + for( size_t k = 0; k < local_fold_into_nz; ++k ) { + const size_t i = ( already_dense_output ? 
+ k : + local_fold_into.index( k ) + ) + lower_bound; +#ifdef _DEBUG + std::cout << "Right-folding " << to_fold[ i ] << " into " << fold_into[ i ]; +#endif + (void) foldr< descr >( to_fold[ i ], fold_into[ i ], op ); +#ifdef _DEBUG + std::cout << " resulting into " << fold_into[ i ] << "\n"; +#endif + } + } + } else { +#ifdef _DEBUG + std::cout << "fold_from_vector_to_vector_generic: using specialised " + << "code to merge two sparse vectors and, potentially, " + << "output masks\n"; +#endif + const IType * __restrict__ const tf_raw = internal::getRaw( to_fold ); + IOType * __restrict__ const fi_raw = internal::getRaw( fold_into ); +#ifdef _DEBUG + std::cout << "\tin sequential version...\n"; +#endif + for( size_t k = 0; k < local_to_fold_nz; ++k ) { + const size_t i = ( already_dense_input_to_fold + ? k + : local_to_fold.index( k ) + ) + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( *m ).template mask< descr >( i, + internal::getRaw( *m ) ) + ) { + continue; + } + } else { + if( !local_m.template mask< descr >( i - lower_bound, + internal::getRaw( *m ) + lower_bound ) + ) { + continue; + } + } + } + + assert( i < n ); + if( already_dense_output || + local_fold_into.assigned( i - lower_bound ) + ) { + if( left ) { +#ifdef _DEBUG + std::cout << "\tfoldl< descr >( fi_raw[ i ], tf_raw[ i ], op ), i = " + << i << ": " << tf_raw[ i ] << " goes into " << fi_raw[ i ]; +#endif + (void)foldl< descr >( fi_raw[ i ], tf_raw[ i ], op ); +#ifdef _DEBUG + std::cout << " which results in " << fi_raw[ i ] << "\n"; +#endif + } else { +#ifdef _DEBUG + std::cout << "\tfoldr< descr >( tf_raw[ i ], fi_raw[ i ], op ), i = " + << i << ": " << tf_raw[ i ] << " goes into " << fi_raw[ i ]; +#endif + (void) foldr< descr >( tf_raw[ i ], fi_raw[ i ], op ); +#ifdef _DEBUG + std::cout << " which results in " << fi_raw[ i ] << "\n"; +#endif + } + } else if( monoid ) { +#ifdef _DEBUG + std::cout << "\tindex " << i << " is unset. Old value " << fi_raw[ i ] + << " will be overwritten with " << tf_raw[ i ] << "\n"; +#endif + fi_raw[ i ] = tf_raw[ i ]; + (void) local_fold_into.assign( i - lower_bound ); + } + } + } + } + +#ifdef _DEBUG + std::cout << "\tCall to fold_from_vector_to_vector_generic done. 
" + << "Output now contains " << local_fold_into_nz << " / " + << local_n << " nonzeroes.\n"; +#endif + assert( false ); + return UNSUPPORTED; + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename InputType, + typename IOType, + typename MaskType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + IOType &beta, + const Monoid &monoid = Monoid(), + const typename std::enable_if< + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, InputType >::value ), "grb::foldr", + "called with a scalar IO type that does not match the input vector type" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldr", + "called with an input vector value type that does not match the first " + "domain of the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldr", + "called with an input vector type that does not match the second domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldr", + "called with an input vector type that does not match the third domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::foldr", + "called with a vector mask type that is not boolean" ); + + if( size( mask ) > 0 ) { + return internal::template fold_from_vector_to_scalar_generic< + descr, true, false + >( beta, x, mask, monoid ); + } else { + return internal::template fold_from_vector_to_scalar_generic< + descr, false, false + >( beta, x, mask, monoid ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename InputType, + typename IOType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + IOType &beta, + const Monoid &monoid = Monoid(), + const typename std::enable_if< + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, InputType >::value ), "grb::foldr", + "called with a scalar IO type that does not match the input vector type" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldr", + "called with an input vector value type that does not match the first " + "domain of the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldr", + "called with an input vector type that does not match the second domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldr", + "called with an input vector type that does not match the third domain of " + "the given monoid" ); + + Vector< bool, ascend, Coords > empty_mask( 0 ); + return internal::template 
fold_from_vector_to_scalar_generic< + descr, false, false + >( beta, x, empty_mask, monoid ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldr( + const InputType &alpha, + Vector< IOType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [alpha, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(alpha, y, monoid) in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_y; + size_t local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + + if( !already_dense_vectors ) { + const size_t local_n = upper_bound - lower_bound; +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, false, true, false, true + >( + already_dense_output, true, + lower_bound, upper_bound, local_y, local_null_mask, + y, null_mask, alpha, monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, false, false, false, true + >( + already_dense_output, true, + lower_bound, upper_bound, local_y, local_null_mask, + y, null_mask, alpha, monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( 
!already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( y ).size(), + sizeof( IOType ), + dense_descr, true, + &y, nullptr, + &internal::getCoordinates( y ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(alpha, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename InputType, + typename Coords + > + RC foldr( + const InputType &alpha, + Vector< IOType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [alpha, &y, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + { + std::cout << "\t\tExecution of stage foldl(alpha, y, op) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; + } +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, false, true, false, false + >( + already_dense_output, true, + lower_bound, upper_bound, + local_y, 
local_null_mask, y, null_mask, + alpha, op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, false, false, false, false + >( + already_dense_output, true, + lower_bound, upper_bound, local_y, local_null_mask, + y, null_mask, alpha, op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( y ).size(), + sizeof( IOType ), + dense_descr, true, + &y, nullptr, + &internal::getCoordinates( y ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(alpha, y, op)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename InputType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + Vector< IOType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_operator< OP >::value && + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value, + void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType >::value ), "grb::eWiseFoldr", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the third domain " + "of the given operator" ); + + const size_t n = size( x ); + if( n != size( y ) ) { + return MISMATCH; + } + +#ifdef _DEBUG + std::cout << "In foldr ([T]<-[T])\n"; +#endif + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldr(x, y, operator) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + 
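// if the pipeline already knows y to be dense, the local subset
+				// extraction below can be skipped
+				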
already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, true, false, false + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_y, local_null_mask, + local_x, y, + null_mask, x, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, false, false, false + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_y, local_null_mask, + local_x, + y, null_mask, + x, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
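// keep an earlier error code if any; otherwise append this stage to the pipeline
+			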
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), + dense_descr, true, + getID( y ), + &y, nullptr, + &internal::getCoordinates( y ), nullptr, + getID( x ), SIZE_MAX, SIZE_MAX, SIZE_MAX, + &x, nullptr, nullptr, nullptr, + &internal::getCoordinates( x ), nullptr, nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldr(x, y, operator)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + Vector< IOType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_operator< OP >::value && + !grb::is_object< InputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< IOType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType >::value ), "grb::eWiseFoldr", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the third domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseFoldr", + "called with a non-Boolean mask" ); + + if( size( m ) == 0 ) { + return foldr< descr >( x, y, op, phase ); + } + + const size_t n = size( x ); + if( n != size( y ) || n != size( m ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, &y, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldr(x, m, y, operator) in the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_m, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_mask = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + 
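// same already-dense check for the mask; note the mask alone never sets
+				// the sparse flag
+				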
already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, true, true, false + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_y, &local_m, local_x, + y, &m, x, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, false, true, false + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_y, &local_m, local_x, + y, &m, x, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), + dense_descr, true, + &y, nullptr, &internal::getCoordinates( y ), nullptr, + &x, &m, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( m ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldr(x, m, y, operator)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + Vector< IOType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_monoid< Monoid >::value && + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value, + void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType >::value ), "grb::eWiseFoldr", + "called with a vector x of a type that does not match the first domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the second domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the third domain " + "of the given monoid" ); + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + 
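// the fold itself executes lazily: the lambda below is registered as a
+		// pipeline stage via internal::le.addStage and only runs when the
+		// pipeline is executed
+		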
constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldr(x, y, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, true, false, true + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_y, local_null_mask, local_x, + y, null_mask, x, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, false, false, true + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_y, local_null_mask, local_x, + y, null_mask, x, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + &y, nullptr, &internal::getCoordinates( y ), nullptr, + &x, nullptr, nullptr, nullptr, + &internal::getCoordinates( x ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldr(x, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldr( + const Vector< InputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + Vector< IOType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_monoid< Monoid >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value && + !grb::is_object< IOType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType >::value ), "grb::eWiseFoldr", + "called with a vector x of a type that does not match the first domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the second domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::eWiseFoldr", + "called on a vector y of a type that does not match the third domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseFoldr", + "called with a mask of non-Boolean type" ); + + // check empty mask + if( size( m ) == 0 ) { + return foldr< descr >( x, y, monoid, phase ); + } + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) || n != size( m ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldr(x, m, y, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_m, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_mask = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef 
GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, true, true, true + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_y, &local_m, local_x, + y, &m, x, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, false, false, true, true + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_y, &local_m, local_x, + y, &m, x, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + &y, nullptr, &internal::getCoordinates( y ), nullptr, + &x, &m, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( m ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldr(x, m, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Op, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const InputType beta, + const Op &op = Op(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value && + grb::is_operator< Op >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D1, IOType >::value ), + "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D2, InputType >::value ), + "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D3, IOType >::value ), + "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, beta, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, beta, op) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, true, false, false + >( + already_dense_output, true, + lower_bound, upper_bound, + local_x, local_null_mask, + x, null_mask, + beta, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = 
internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, false, false, false + >( + already_dense_output, true, + lower_bound, upper_bound, + local_x, local_null_mask, + x, null_mask, beta, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( x ).size(), sizeof( IOType ), + dense_descr, true, + &x, nullptr, + &internal::getCoordinates( x ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, beta, op)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Op, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const InputType beta, + const Op &op = Op(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value && + grb::is_operator< Op >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D1, IOType >::value ), + "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D2, InputType >::value ), + "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Op::D3, IOType >::value ), + "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting ) || + std::is_same< bool, MaskType >::value ), + "grb::foldl (reference, vector <- scalar, masked)", + "provided mask does not have boolean entries" ); + + // check empty mask + if( size( m ) == 0 ) { + return foldl< descr >( x, beta, op, phase ); + } + + // dynamic checks + const size_t n = size( x ); + if( size( m ) != n ) { + return MISMATCH; + } + + // catch trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, beta, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, m, beta, op) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_mask; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool 
already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, true, true, false + >( + already_dense_output, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_mask, + x, &m, + beta, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, false, true, false + >( + already_dense_output, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_mask, + x, &m, + beta, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_SCALAR_VECTOR_GENERIC, + n, sizeof( IOType ), + dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &m, nullptr, nullptr, nullptr, + &internal::getCoordinates( m ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, m, beta, op)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const InputType beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value && + grb::is_monoid< Monoid >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given monoid" ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, 
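// beta is captured by value since execution of this stage is deferred
+			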
beta, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, beta, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, true, false, true + >( + already_dense_output, true, + lower_bound, upper_bound, + local_x, local_null_mask, + x, null_mask, + beta, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, false, false, true + >( + already_dense_output, true, + lower_bound, upper_bound, + local_x, local_null_mask, + x, null_mask, + beta, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( x ).size(), sizeof( IOType ), + dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, beta, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const InputType &beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value && + grb::is_monoid< Monoid >::value, + void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::foldl (ascend, vector <- scalar, masked, monoid)", + "provided mask does not have boolean entries" ); + + // check for empty mask + if( size( m ) == 0 ) { + return foldl< descr >( x, beta, monoid, phase ); + } + + // dynamic checks + const size_t n = size( x ); + if( n != size( m ) ) { return MISMATCH; } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, beta, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, m, beta, monoid) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_m; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION 
+ } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, true, true, true + >( + already_dense_output, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, + x, &m, + beta, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_scalar_to_vector_generic< +#else + rc = internal::fold_from_scalar_to_vector_generic< +#endif + descr, true, false, true, true + >( + already_dense_output, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, + x, &m, + beta, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_SCALAR_VECTOR_GENERIC, + internal::getCoordinates( x ).size(), sizeof( IOType ), + dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &m, nullptr, nullptr, nullptr, + &internal::getCoordinates( m ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, m, beta, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_operator< OP >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( (!( descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &op, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, y, operator) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + 
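// unmasked variant: null (local) mask pointers select the mask-free code
+			// path in the generic fold implementation
+			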
const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, true, false, false + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_x, local_null_mask, local_y, + x, null_mask, y, + op, phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, false, false, false + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_x, local_null_mask, local_y, + x, null_mask, y, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + getID( x ), + &x, nullptr, &internal::getCoordinates( x ), nullptr, + getID( y ), SIZE_MAX, SIZE_MAX, SIZE_MAX, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, y, operator)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_monoid< Monoid >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, y, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz, local_y_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_input = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( 
lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, true, false, true + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_x, local_null_mask, local_y, + x, null_mask, y, + monoid.getOperator(), phase + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, false, false, true + >( + already_dense_output, already_dense_input, true, + lower_bound, upper_bound, + local_x, local_null_mask, local_y, + x, null_mask, y, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_operator< OP >::value && + !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::foldl", + "called with a mask that does not have boolean entries " ); + + // catch empty mask + if( size( m ) == 0 ) { + return foldl< descr >( x, y, op, phase ); + } + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) || n != size( m ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, &y, &op, phase] ( + internal::Pipeline &pipeline, + 
const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, m, y, op) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y, local_m; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_input = true; + bool already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, true, true, false + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, local_y, + x, &m, y, + op, phase + ); + } else { + assert( local_x_nz == local_n ); + assert( local_y_nz == local_n ); +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, false, true, false + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, local_y, + x, &m, y, + op, phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
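+ // As above, when GRB_BOOLEAN_DISPATCHER is defined the run-time
+ // already_dense_* flags are routed through the boolean_dispatcher_* wrappers
+ // and become compile-time template parameters of the generic kernels, so
+ // each hot loop is instantiated once per flag combination; without the
+ // macro the same kernels receive the flags as ordinary run-time booleans.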
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &y, &m, nullptr, nullptr, + &internal::getCoordinates( y ), &internal::getCoordinates( m ), nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, m, y, op)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename MaskType, + typename InputType, + typename Coords + > + RC foldl( + Vector< IOType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< grb::is_monoid< Monoid >::value && + !grb::is_object< IOType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value, void + >::type * = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, IOType >::value ), "grb::foldl", + "called with a vector x of a type that does not match the first domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType >::value ), "grb::foldl", + "called on a vector y of a type that does not match the second domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, IOType >::value ), "grb::foldl", + "called on a vector x of a type that does not match the third domain " + "of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::foldl", + "called with a mask that does not have boolean entries" ); + + // catch empty mask + if( size( m ) == 0 ) { + return foldl< descr >( x, y, monoid, phase ); + } + + // dynamic sanity checks + const size_t n = size( x ); + if( n != size( y ) || n != size( m ) ) { + return MISMATCH; + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &m, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage foldl(x, m, y, monoid) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y, local_m; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_input = true; + bool already_dense_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz 
< local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, true, true, true + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, local_y, + x, &m, y, + monoid.getOperator(), phase + ); + } else { + assert( local_x_nz == local_n ); + assert( local_y_nz == local_n ); + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_fold_from_vector_to_vector_generic< +#else + rc = internal::fold_from_vector_to_vector_generic< +#endif + descr, true, false, true, true + >( + already_dense_output, already_dense_input, already_dense_mask, + lower_bound, upper_bound, + local_x, &local_m, local_y, + x, &m, y, + monoid.getOperator(), phase + ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_FOLD_MASKED_VECTOR_VECTOR_GENERIC, + n, sizeof( IOType ), + dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &y, &m, nullptr, nullptr, + &internal::getCoordinates( y ), &internal::getCoordinates( m ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: foldl(x, m, y, monoid)" + << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, class OP, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dense_apply_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op + ) { +#ifdef _DEBUG + std::cout << "\t internal::dense_apply_generic called\n"; +#endif + static_assert( !(left_scalar && left_sparse), + "The left-hand side must be scalar OR sparse, but cannot be both!" ); + static_assert( !(right_scalar && right_sparse), + "The right-hand side must be scalar OR sparse, but cannot be both!" 
); + static_assert( !(left_sparse && right_sparse), + "If both left- and right-hand sides are sparse, use sparse_apply_generic " + "instead." ); + + // create local copies of the input const pointers + OutputType * __restrict__ const z_p = internal::getRaw( z_vector ); + const InputType1 * __restrict__ x_p = x_wrapper.getRaw(); + const InputType2 * __restrict__ y_p = y_wrapper.getRaw(); + + const size_t local_n = upper_bound - lower_bound; + + constexpr const size_t block_size = OP::blocksize; + const size_t num_blocks = local_n / block_size; + +#ifndef NDEBUG + const bool has_coda = local_n % block_size > 0; +#endif + size_t i = 0 + lower_bound; + const size_t start = 0; + const size_t end = num_blocks; + + // declare and initialise local buffers for SIMD + OutputType z_b[ block_size ]; + InputType1 x_b[ block_size ]; + InputType2 y_b[ block_size ]; + bool x_m[ block_size ]; + bool y_m[ block_size ]; + for( size_t k = 0; k < block_size; ++k ) { + if( left_scalar ) { + x_b[ k ] = x_wrapper.getValue(); + } + if( right_scalar ) { + y_b[ k ] = y_wrapper.getValue(); + } + } + + for( size_t block = start; block < end; ++block ) { + size_t local_i = i; + for( size_t k = 0; k < block_size; ++k ) { + if( !left_scalar ) { + x_b[ k ] = x_p[ local_i ]; + } + if( !right_scalar ) { + y_b[ k ] = y_p[ local_i ]; + } + if( left_sparse ) { + x_m[ k ] = already_dense_input_x || local_x.assigned( local_i - + lower_bound ); + } + if( right_sparse ) { + y_m[ k ] = already_dense_input_y || local_y.assigned( local_i - + lower_bound ); + } + (void) ++local_i; + } + for( size_t k = 0; k < block_size; ++k ) { + RC rc = SUCCESS; + if( left_sparse && !x_m[ k ] ) { + z_b[ k ] = y_b[ k ]; // WARNING: assumes monoid semantics! + } else if( right_sparse && !y_m[ k ] ) { + z_b[ k ] = x_b[ k ]; // WARNING: assumes monoid semantics! 
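+ // When exactly one side is sparse and its entry is missing at this index,
+ // the other operand's value is passed through unchanged. This is only
+ // correct under monoid semantics, where a missing entry acts as the
+ // identity of the operator, hence the warnings above.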
+ } else { + rc = apply( z_b[ k ], x_b[ k ], y_b[ k ], op ); + } + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + } + for( size_t k = 0; k < block_size; ++k, ++i ) { + z_p[ i ] = z_b[ k ]; + } + } + +#ifndef NDEBUG + if( has_coda ) { + assert( i < local_n + lower_bound ); + } else { + assert( i == local_n + lower_bound ); + } +#endif + + i = end * block_size + lower_bound; + for( ; i < local_n + lower_bound; ++i ) { + RC rc = SUCCESS; + if( left_scalar && right_scalar ) { + rc = apply( z_p[ i ], x_wrapper.getValue(), y_wrapper.getValue(), op ); + } else if( left_scalar && !right_scalar ) { + if( right_sparse && !( already_dense_input_y || local_y.assigned( i - + lower_bound ) ) + ) { + z_p[ i ] = x_wrapper.getValue(); + } else { + rc = apply( z_p[ i ], x_wrapper.getValue(), y_p[ i ], op ); + } + } else if( !left_scalar && right_scalar ) { + if( left_sparse && !( already_dense_input_x || local_x.assigned( i - + lower_bound ) ) + ) { + z_p[ i ] = y_wrapper.getValue(); + } else { + rc = apply( z_p[ i ], x_p[ i ], y_wrapper.getValue(), op ); + } + } else { + assert( !left_scalar && !right_scalar ); + if( left_sparse && !(already_dense_input_x || local_x.assigned( i - + lower_bound ) ) + ) { + z_p[ i ] = y_p[ i ]; + } else if( right_sparse && !(already_dense_input_y || local_y.assigned( i - + lower_bound ) ) + ) { + z_p[ i ] = x_p[ i ]; + } else { + assert( !left_sparse && !right_sparse ); + rc = apply( z_p[ i ], x_p[ i ], y_p[ i ], op ); + } + } + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + } + + assert( false ); + return UNSUPPORTED; + } + + template< + bool masked, + bool monoid, + bool x_scalar, + bool y_scalar, + Descriptor descr, + class OP, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC sparse_apply_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_mask_ptr, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const mask_vector, + const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper, + const OP &op + ) { +#ifndef GRB_NO_NOOP_CHECKS + static_assert( !internal::maybe_noop< OP >::value, "Warning: you may be " + "generating an output vector with uninitialised values. Define " + "the GRB_NO_NOOP_CHECKS macro to disable this check.\n" ); +#endif + // create local copies of the input const pointers + OutputType * __restrict__ const z_p = internal::getRaw( z_vector ); + const MaskType * __restrict__ const mask_p = ( masked ) + ? internal::getRaw( *mask_vector ) + : nullptr; + const InputType1 * __restrict__ x_p = x_wrapper.getRaw(); + const InputType2 * __restrict__ y_p = y_wrapper.getRaw(); + + Coords local_mask; + if( masked ) { + local_mask = *local_mask_ptr; + } + + const size_t local_n = upper_bound - lower_bound; + const size_t local_x_nz = already_dense_input_x + ? local_n + : local_x.nonzeroes(); + const size_t local_y_nz = already_dense_input_y + ? 
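+ // For an input that is known to be dense across the whole pipeline
+ // (already_dense_input_*), no tile-local coordinate subset is materialised
+ // and its local nonzero count is simply the tile length local_n.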
local_n + : local_y.nonzeroes(); + + // assertions + assert( !masked || local_mask_ptr != nullptr ); + assert( !masked || local_mask_ptr->size() == local_n ); + assert( x_scalar || local_x_nz <= local_n ); + assert( y_scalar || local_y_nz <= local_n ); + +#ifdef _DEBUG + std::cout << "\tinternal::sparse_apply_generic called\n"; +#endif + constexpr const size_t block_size = OP::blocksize; + + // swap so that we do the expensive pass over the container with the fewest + // nonzeroes first + assert( !x_scalar || !y_scalar ); + const bool swap = ( ( x_scalar || already_dense_input_x ) + ? local_n + : local_x_nz + ) > ( ( y_scalar || already_dense_input_y ) + ? local_n + : local_y_nz + ); + const Coordinates< nonblocking > &loop_coors = swap ? local_y : local_x; + const Coordinates< nonblocking > &chk_coors = swap ? local_x : local_y; + const bool already_dense_loop = swap + ? already_dense_input_y + : already_dense_input_x; + const bool already_dense_chk = swap + ? already_dense_input_x + : already_dense_input_y; + + const size_t loop_coors_nz = swap ? local_y_nz : local_x_nz; + const size_t chk_coors_nz = swap ? local_x_nz : local_y_nz; +#ifdef _DEBUG + std::cout << "\t\tfirst-phase loop of size " << loop_coors.size() << "\n"; + if( x_scalar || y_scalar ) { + std::cout << "\t\tthere will be no second phase because one of the inputs " + << "is scalar\n"; + } else { + std::cout << "\t\tsecond-phase loop of size " << chk_coors.size() << "\n"; + } +#endif + // declare buffers for vectorisation + size_t offsets[ block_size ]; + OutputType z_b[ block_size ]; + InputType1 x_b[ block_size ]; + InputType2 y_b[ block_size ]; + bool mask[ block_size ]; + bool x_m[ block_size ]; + bool y_m[ block_size ]; + + if( x_scalar ) { + for( size_t k = 0; k < block_size; ++k ) { + x_b[ k ] = x_wrapper.getValue(); + } + } + if( y_scalar ) { + for( size_t k = 0; k < block_size; ++k ) { + y_b[ k ] = y_wrapper.getValue(); + } + } + + // expensive pass #1 + size_t start = 0; + size_t end = loop_coors_nz / block_size; + size_t k = 0; + for( size_t b = start; b < end; ++b ) { + // perform gathers + for( size_t i = 0; i < block_size; ++i ) { + const size_t index = ( already_dense_loop ) + ? ( ( k++ ) + lower_bound ) + : ( loop_coors.index( k++ ) + lower_bound ); + offsets[ i ] = index; + assert( index < local_n + lower_bound ); + if( masked ) { + if( already_dense_mask ) { + mask[ i ] = internal::getCoordinates( *mask_vector ).template + mask< descr >( index, mask_p ); + } else { + mask[ i ] = local_mask.template mask< descr >( index - lower_bound, + mask_p + lower_bound ); + } + } + } + // perform gathers + for( size_t i = 0; i < block_size; ++i ) { + if( !masked || mask[ i ] ) { + if( !x_scalar ) { + x_b[ i ] = x_p[ offsets[ i ] ]; + } + if( !x_scalar && !y_scalar ) { + y_m[ i ] = already_dense_chk || chk_coors.assigned( offsets[ i ] - + lower_bound ); + } else { + y_m[ i ] = true; + } + if( !y_scalar ) { + y_b[ i ] = y_p[ offsets[ i ] ]; + } + } else { + y_m[ i ] = false; + } + } + // perform compute + for( size_t i = 0; i < block_size; ++i ) { + RC rc = SUCCESS; + if( y_m[ i ] ) { + rc = apply( z_b[ i ], x_b[ i ], y_b[ i ], op ); + } else if( monoid ) { + if( swap ) { + z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] ); + } else { + z_b[ i ] = static_cast< typename OP::D3 >( y_b[ i ] ); + } + } + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + } + // part that may or may not be vectorised (can we do something about this??) 
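+ // Pass #1 runs over the operand with the fewest nonzeroes in blocks of
+ // OP::blocksize: gather the indices and mask bits, gather the operand
+ // values, apply the operator (or fall back to the other operand under
+ // monoid semantics), then record the touched coordinates and scatter the
+ // results. A scalar coda handles the remaining elements, and, for monoids
+ // with two vector inputs, pass #2 further below completes the output from
+ // the nonzeroes that only the other operand has.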
+ for( size_t i = 0; i < block_size; ++i ) { + if( !masked || mask[ i ] ) { + if( y_m[ i ] || monoid ) { + (void) local_z.assign( offsets[ i ] - lower_bound ); + } + } + } + // perform scatter + for( size_t i = 0; i < block_size; ++i ) { + if( !masked || mask[ i ] ) { + if( monoid || y_m[ i ] ) { + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the only way the below could write + // an uninitialised value is if the + // static_assert at the top of this + z_p[ offsets[ i ] ] = z_b[ i ]; // function had triggered. See also + GRB_UTIL_RESTORE_WARNINGS // internal issue #321. + } + } + } + } + + for( ; k < loop_coors_nz; ++k ) { + const size_t index = ( already_dense_loop ) + ? k + lower_bound + : loop_coors.index( k ) + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( *mask_vector ).template mask< descr >( + index, mask_p ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound, mask_p + + lower_bound ) + ) { + continue; + } + } + } + RC rc = SUCCESS; + (void) local_z.assign( index - lower_bound ); + if( x_scalar || y_scalar || already_dense_chk || chk_coors.assigned( + index - lower_bound ) + ) { + rc = apply( + z_p[ index ], + ( x_scalar ) + ? x_wrapper.getValue() + : x_p[ index ], + ( y_scalar ) + ? y_wrapper.getValue() + : y_p[ index ], + op + ); + } else if( monoid ) { + if( swap ) { + z_p[ index ] = x_scalar ? + static_cast< typename OP::D3 >( x_wrapper.getValue() ) : + static_cast< typename OP::D3 >( x_p[ index ] ); + } else { + z_p[ index ] = y_scalar ? + static_cast< typename OP::D3 >( y_wrapper.getValue() ) : + static_cast< typename OP::D3 >( y_p[ index ] ); + } + } + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + } + + // cheaper pass #2, only required if we are using monoid semantics + // AND if both inputs are vectors + if( monoid && !x_scalar && !y_scalar ) { + start = 0; + end = chk_coors_nz / block_size; + k = 0; + for( size_t b = start; b < end; ++b ) { + // streaming load + for( size_t i = 0; i < block_size; i++ ) { + offsets[ i ] = ( already_dense_chk ) + ? ( ( k++ ) + lower_bound ) + : ( chk_coors.index( k++ ) + lower_bound ); + assert( offsets[ i ] < local_n + lower_bound ); + } + // pure gather + for( size_t i = 0; i < block_size; i++ ) { + x_m[ i ] = already_dense_loop || loop_coors.assigned( offsets[ i ] - + lower_bound ); + } + // gather-like + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( already_dense_mask ) { + mask[ i ] = utils::interpretMask< descr >( + internal::getCoordinates( *mask_vector ).assigned( offsets[ i ] ), + mask_p, offsets[ i ] + ); + } else { + mask[ i ] = utils::interpretMask< descr >( + local_mask.assigned( offsets[ i ] - lower_bound ), + mask_p, offsets[ i ] + ); + } + } + } + // SIMD + for( size_t i = 0; i < block_size; i++ ) { + x_m[ i ] = ! 
x_m[ i ]; + } + // SIMD + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + mask[ i ] = mask[ i ] && x_m[ i ]; + } + } + if( !swap ) { + // gather + for( size_t i = 0; i < block_size; ++i ) { + if( masked ) { + if( mask[ i ] ) { + y_b[ i ] = y_p[ offsets[ i ] ]; + } + } else { + if( x_m[ i ] ) { + y_b[ i ] = y_p[ offsets[ i ] ]; + } + } + } + // SIMD + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( mask[ i ] ) { + z_b[ i ] = y_b[ i ]; + } + } else { + if( x_m[ i ] ) { + z_b[ i ] = y_b[ i ]; + } + } + } + } else { + // gather + for( size_t i = 0; i < block_size; ++i ) { + if( masked ) { + if( mask[ i ] ) { + x_b[ i ] = x_p[ offsets[ i ] ]; + } + } else { + if( x_m[ i ] ) { + x_b[ i ] = x_p[ offsets[ i ] ]; + } + } + } + // SIMD + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( mask[ i ] ) { + z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] ); + } + } else { + if( x_m[ i ] ) { + z_b[ i ] = static_cast< typename OP::D3 >( x_b[ i ] ); + } + } + } + } + // SIMD-like + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( mask[ i ] ) { + (void)local_z.assign( offsets[ i ] - lower_bound ); + } + } else { + if( x_m[ i ] ) { + (void)local_z.assign( offsets[ i ] - lower_bound ); + } + } + } + // scatter + for( size_t i = 0; i < block_size; i++ ) { + if( masked ) { + if( mask[ i ] ) { + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED + + z_p[ offsets[ i ] ] = z_b[ i ]; + + GRB_UTIL_RESTORE_WARNINGS + } + } else { + if( x_m[ i ] ) { +#ifdef _DEBUG + std::cout << "\t\t writing out " << z_b[ i ] << " to index " + << offsets[ i ] << "\n"; +#endif + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // the only way the below could write + // an uninitialised value is if the + // static_assert at the top of this + z_p[ offsets[ i ] ] = z_b[ i ]; // function had triggered. See also + GRB_UTIL_RESTORE_WARNINGS // internal issue #321. + } + } + } + } + for( ; k < chk_coors_nz; ++k ) { + const size_t index = ( ( already_dense_chk ) + ? k + : chk_coors.index( k ) ) + lower_bound; + assert( index < local_n + lower_bound ); + if( already_dense_loop || loop_coors.assigned( index - lower_bound) ) { + continue; + } + if( masked ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( *mask_vector ).template mask< descr >( + index, mask_p ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( index - lower_bound , mask_p + + lower_bound ) + ) { + continue; + } + } + } + (void) local_z.assign( index - lower_bound ); + z_p[ index ] = swap ? 
x_p[ index ] : y_p[ index ]; + } + } + + return SUCCESS; + } + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, typename MaskType, + typename InputType1, typename InputType2, + typename Coords + > + RC masked_apply_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_mask, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &mask_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op, +#ifdef GRB_BOOLEAN_DISPATCHER + const InputType1 * const left_identity, + const InputType2 * const right_identity +#else + const InputType1 * const left_identity = nullptr, + const InputType2 * const right_identity = nullptr +#endif + ) { +#ifdef _DEBUG + std::cout << "In masked_apply_generic< " << left_scalar << ", " + << right_scalar << ", " << left_sparse << ", " << right_sparse << ", " + << descr << " > with lower_bound = " << lower_bound << " and upper_bound = " + << upper_bound << "\n"; +#endif + // assertions + static_assert( !(left_scalar && left_sparse), + "left_scalar and left_sparse cannot both be set!" + ); + static_assert( !(right_scalar && right_sparse), + "right_scalar and right_sparse cannot both be set!" + ); + assert( !left_sparse || left_identity != nullptr ); + assert( !right_sparse || right_identity != nullptr ); + + // create local copies of the input const pointers + OutputType * __restrict__ const z_p = internal::getRaw( z_vector ); + const MaskType * __restrict__ const mask_p = internal::getRaw( mask_vector ); + const InputType1 * __restrict__ x_p = x_wrapper.getRaw(); + const InputType2 * __restrict__ y_p = y_wrapper.getRaw(); + + const size_t local_n = upper_bound - lower_bound; + const size_t local_mask_nz = ( already_dense_mask ) + ? local_n + : local_mask.nonzeroes(); +#ifdef _DEBUG + std::cout << "\tinternal::masked_apply_generic called with nnz(mask)=" + << local_mask_nz << " and descriptor " << descr << "\n"; + if( local_mask_nz > 0 ) { + std::cout << "\t\tNonzero mask indices: " + << ( already_dense_mask ? 0 : local_mask.index( 0 ) ); + assert( local_mask.assigned( local_mask.index( 0 ) ) ); + for( size_t k = 1; k < local_mask_nz; ++k ) { + std::cout << ", " + << ( ( already_dense_mask ) ? k : local_mask.index( k ) ); + assert( + already_dense_mask || + local_mask.assigned( local_mask.index( k ) ) + ); + } + std::cout << "\n"; + } + + size_t unset = 0; + for( size_t i = 0; i < local_n; ++i ) { + if( !( already_dense_mask || local_mask.assigned( i ) ) ) { + (void) ++unset; + } + } + assert( unset == local_n - local_mask_nz ); +#endif + // whether to use a Theta(n) or a Theta(nnz(mask)) loop + const bool bigLoop = local_mask_nz == local_n || + (descr & descriptors::invert_mask); + + // get block size + constexpr size_t size_t_block_size = config::SIMD_SIZE::value() / + sizeof( size_t ); + constexpr size_t op_block_size = OP::blocksize; + constexpr size_t min_block_size = op_block_size > size_t_block_size + ? 
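+ // min_block_size is the smaller of the operator's blocksize and the number
+ // of size_t indices that fit in one SIMD register; the Theta(nnz(mask))
+ // small-loop variant below blocks on it, while the Theta(n) big-loop
+ // variant, taken when the mask is completely filled or when invert_mask is
+ // set, blocks on the operator blocksize directly.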
size_t_block_size + : op_block_size; + + if( bigLoop ) { +#ifdef _DEBUG + std::cerr << "\t in bigLoop variant\n"; +#endif + size_t i = 0 + lower_bound; + + constexpr const size_t block_size = op_block_size; + const size_t num_blocks = local_n / block_size; + const size_t start = 0; + const size_t end = num_blocks; + + // declare buffers that fit in a single SIMD register and initialise if + // needed + bool mask_b[ block_size ]; + OutputType z_b[ block_size ]; + InputType1 x_b[ block_size ]; + InputType2 y_b[ block_size ]; + for( size_t k = 0; k < block_size; ++k ) { + if( left_scalar ) { + x_b[ k ] = x_wrapper.getValue(); + } + if( right_scalar ) { + y_b[ k ] = y_wrapper.getValue(); + } + } + for( size_t b = start; b < end; ++b ) { + for( size_t k = 0; k < block_size; ++k ) { + const size_t index = i + k; + assert( index < local_n + lower_bound ); + if( already_dense_mask ) { + mask_b[ k ] = internal::getCoordinates( mask_vector ).template + mask< descr >( index, mask_p ); + } else { + mask_b[ k ] = local_mask.template + mask< descr >( index - lower_bound, mask_p + lower_bound ); + } + } + // check for no output + if( left_sparse && right_sparse ) { + for( size_t k = 0; k < block_size; ++k ) { + const size_t index = i + k; + assert( index < local_n + lower_bound ); + if( mask_b[ k ] ) { + if( !( already_dense_input_x || + local_x.assigned( index - lower_bound ) + ) && !( + already_dense_input_y || + local_y.assigned( index - lower_bound ) + ) + ) { + mask_b[ k ] = false; + } + } + } + } + for( size_t k = 0; k < block_size; ++k ) { + const size_t index = i + k; + assert( index < local_n + lower_bound ); + if( mask_b[ k ] ) { + if( !left_scalar ) { + if( left_sparse && !( + already_dense_input_x || local_x.assigned( index - lower_bound ) + ) ) { + x_b[ k ] = *left_identity; + } else { + x_b[ k ] = *( x_p + index ); + } + } + if( !right_scalar ) { + if( right_sparse && !( + already_dense_input_y || local_y.assigned( index - lower_bound ) + ) ) { + y_b[ k ] = *right_identity; + } else { + y_b[ k ] = *( y_p + index ); + } + } + } + } + for( size_t k = 0; k < block_size; ++k ) { + if( mask_b[ k ] ) { + apply( z_b[ k ], x_b[ k ], y_b[ k ], op ); + } + } + for( size_t k = 0; k < block_size; ++k ) { + const size_t index = i + k; + assert( index < local_n + lower_bound ); + if( mask_b[ k ] ) { + (void) local_z.assign( index - lower_bound ); + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // This is only triggered with + *( z_p + index ) = z_b[ k ]; // mask_b[ k ], which in the above + GRB_UTIL_RESTORE_WARNINGS // loop also triggeres initialising + // z_b[ k ] + } + } + + i += block_size; + } + // scalar coda + for( + size_t i = end * block_size + lower_bound; + i < local_n + lower_bound; + ++i + ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( mask_vector ).template mask< descr >( i, + mask_p ) + ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( i - lower_bound, mask_p + + lower_bound ) + ) { + continue; + } + } + + if( left_sparse && right_sparse ) { + if( !( already_dense_input_x || local_x.assigned( i - lower_bound ) ) && + !( already_dense_input_y || local_y.assigned( i - lower_bound ) ) + ) { + continue; + } + } + (void) local_z.assign( i - lower_bound ); + const InputType1 x_e = left_scalar + ? x_wrapper.getValue() + : ( (!left_sparse || already_dense_input_x || + local_x.assigned( i - lower_bound )) + ? *(x_p + i) + : *left_identity + ); + const InputType2 y_e = right_scalar + ? 
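+ // In this scalar coda a scalar operand is read from its wrapper, while a
+ // structurally missing entry of a sparse operand falls back to the supplied
+ // left or right identity element.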
y_wrapper.getValue() + : ( (!right_sparse || already_dense_input_y || + local_y.assigned( i - lower_bound )) + ? *(y_p + i) + : *right_identity + ); + OutputType * const z_e = z_p + i; + apply( *z_e, x_e, y_e, op ); + } + } else { +#ifdef _DEBUG + std::cerr << "\t in smallLoop variant\n"; +#endif + // declare buffers that fit in a single SIMD register and initialise if + // needed + constexpr const size_t block_size = size_t_block_size > 0 + ? min_block_size + : op_block_size; + bool mask_b[ block_size ]; + OutputType z_b[ block_size ]; + InputType1 x_b[ block_size ]; + InputType2 y_b[ block_size ]; + size_t indices[ block_size ]; + for( size_t k = 0; k < block_size; ++k ) { + if( left_scalar ) { + x_b[ k ] = x_wrapper.getValue(); + } + if( right_scalar ) { + y_b[ k ] = y_wrapper.getValue(); + } + } + + // loop over mask pattern + const size_t mask_nnz = local_mask_nz; + const size_t num_blocks = mask_nnz / block_size; + const size_t start = 0; + const size_t end = num_blocks; + + size_t k = 0; + + // vectorised code + for( size_t b = start; b < end; ++b ) { + for( size_t t = 0; t < block_size; ++t ) { + indices[ t ] = (already_dense_mask ) ? k + t : local_mask.index( k + t ); + } + for( size_t t = 0; t < block_size; ++t ) { + if( already_dense_mask ) { + mask_b[ t ] = internal::getCoordinates( mask_vector ).template + mask< descr >( indices[ t ], mask_p ); + } else { + mask_b[ t ] = local_mask.template + mask< descr >( indices[ t ], mask_p + lower_bound ); + } + } + for( size_t t = 0; t < block_size; ++t ) { + if( mask_b[ t ] ) { + if( !left_scalar ) { + if( left_sparse && !( already_dense_input_x || + local_x.assigned( indices[ t ] ) ) + ) { + x_b[ t ] = *left_identity; + } else { + x_b[ t ] = *( x_p + indices[ t ] + lower_bound ); + } + } + if( !right_scalar ) { + if( right_sparse && !( already_dense_input_y || + local_y.assigned( indices[ t ] ) ) + ) { + y_b[ t ] = *right_identity; + } else { + y_b[ t ] = *( y_p + indices[ t ] + lower_bound ); + } + } + } + } + // check for no output + if( left_sparse && right_sparse ) { + for( size_t t = 0; t < block_size; ++t ) { + const size_t index = indices[ t ]; + assert( index < local_n + lower_bound ); + if( mask_b[ t ] ) { + if( !( already_dense_input_x || local_x.assigned( index ) ) && + !( already_dense_input_y || local_y.assigned( index ) ) + ) { + mask_b[ t ] = false; + } + } + } + } + for( size_t t = 0; t < block_size; ++t ) { + if( mask_b[ t ] ) { + apply( z_b[ t ], x_b[ t ], y_b[ t ], op ); + } + } + for( size_t t = 0; t < block_size; ++t ) { + if( mask_b[ t ] ) { + (void) local_z.assign( indices[ t ] ); + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // z_b is computed from + *( z_p + indices[ t ] + lower_bound ) = z_b[ t ]; // x_b and y_b, which + GRB_UTIL_RESTORE_WARNINGS // are both initialised + // if mask_b is true + } + } + + k += block_size; + } + + // scalar coda + for( size_t k = end * block_size; k < mask_nnz; ++k ) { + const size_t i = already_dense_mask + ? k + lower_bound + : local_mask.index( k ) + lower_bound; + if( ( already_dense_mask && + internal::getCoordinates( mask_vector ).template mask< descr >( + i, mask_p + ) + ) || local_mask.template mask< descr >( + i - lower_bound, mask_p + lower_bound + ) + ) { + if( left_sparse && right_sparse ) { + if( !( already_dense_input_x || local_x.assigned( i - lower_bound ) ) && + !( already_dense_input_y || local_y.assigned( i - lower_bound ) ) + ) { + continue; + } + } + (void) local_z.assign( i - lower_bound ); + const InputType1 x_e = left_scalar + ? 
x_wrapper.getValue() + : ( + (!left_sparse || already_dense_input_x || + local_x.assigned( i - lower_bound ) ) + ? *(x_p + i) + : *left_identity + ); + const InputType2 y_e = right_scalar + ? y_wrapper.getValue() + : ( + (!right_sparse || already_dense_input_y || + local_y.assigned( i - lower_bound ) ) + ? *(y_p + i) + : *right_identity + ); + OutputType * const z_e = z_p + i; + apply( *z_e, x_e, y_e, op ); + } + } + } + assert( false ); + return UNSUPPORTED; + } + + } // end namespace ``grb::internal'' + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-[T2]<-T3), operator variant\n"; +#endif + // sanity check + auto &z_coors = internal::getCoordinates( z ); + const size_t n = z_coors.size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&z, &x, beta, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, x, beta, operator) in " + << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_x = true; + + size_t local_x_nz = local_n; + + if( !already_dense_vectors ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = 
internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta ); + + // the global stack counter must be set to 0 unless it's guaranteed + // that none of the local_clear and local_assignAll will be invoked + // - local_clear is not invoked when the dense descriptor is given, + // since the output vector will eventually become dense + // - local_assignAll is not invoked when the output vector is already dense + // therefore, the following condition relies on global information, + // i.e., the dense descriptor and the already_dense_output + if( !already_dense_vectors ) { + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); + } + } + + if( local_x_nz == local_n ) { + if( !already_dense_vectors ) { + local_z.local_assignAll( ); + } + + // call dense apply +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_dense_apply_generic< +#else + rc = internal::dense_apply_generic< +#endif + false, true, false, false, descr | descriptors::dense, OP, + OutputType, InputType1, InputType2, Coords + >( + already_dense_input_x, true, + lower_bound, upper_bound, + local_x, local_y, + z, x_wrapper, y_wrapper, + op + ); + } else { + if( !already_dense_vectors ) { + local_z.local_clear(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + } + + // since z and x may not perfectly overlap, and since the intersection is + // unknown a priori, we must iterate over the nonzeroes of x +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + false, false, false, true, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + true, already_dense_input_x, true, + lower_bound, upper_bound, + local_z, local_null_mask, local_x, local_y, + z, null_mask, x_wrapper, y_wrapper, op + ); + } + + if( !already_dense_vectors ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
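+ // The stage above dispatches to dense_apply_generic when x is completely
+ // filled within this tile and to sparse_apply_generic otherwise; the
+ // internal::Wrapper objects let the same kernels treat the vector x and the
+ // scalar beta uniformly through getRaw()/getValue(). The addStage() call
+ // that follows only records the stage in the pipeline, as for the fold
+ // variants above.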
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, nullptr, nullptr, nullptr, + &internal::getCoordinates( x ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, beta, operator)" + << std::endl; +#endif + + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-T2<-T3), operator variant\n"; +#endif + if( (descr & descriptors::dense) && nnz( z ) < size( z ) ) { + return ILLEGAL; + } + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + typename OP::D3 val; + RC ret = apply< descr >( val, alpha, beta, op ); + ret = ret ? 
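+ // With both inputs scalar the result is uniform over z, so the operator is
+ // applied once and the value broadcast via set(). Illustrative example
+ // (values are assumptions): grb::eWiseApply( z, 1.0, 2.0,
+ // grb::operators::add< double >() ) leaves every entry of z equal to 3.0.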
ret : set< descr >( z, val ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, typename MaskType, + typename InputType1, typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const InputType1 alpha, + const InputType2 beta, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-T2<-T3), operator variant\n"; +#endif + // check trivial dispatch + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, alpha, beta, op, phase ); + } + + // dynamic checks + if( size( mask ) != size( z ) ) { + return MISMATCH; + } + if( (descr & descriptors::dense) && + ( nnz( z ) < size( z ) || nnz( mask ) < size( mask ) ) + ) { + return ILLEGAL; + } + + // check trivial dispatch + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + typename OP::D3 val; + RC ret = apply< descr >( val, alpha, beta, op ); + ret = ret ? 
ret : set< descr >( z, mask, val ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-T2<-T3), monoid variant\n"; +#endif + // simply delegate to operator variant + return eWiseApply< descr >( z, alpha, beta, monoid.getOperator(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, typename MaskType, + typename InputType1, typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const InputType1 alpha, + const InputType2 beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-T2<-T3), monoid variant\n"; +#endif + // simply delegate to operator variant + return eWiseApply< descr >( z, mask, alpha, beta, monoid.getOperator(), + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + 
class OP, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, using operator)\n"; +#endif + // check for empty mask + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, x, beta, op ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = + [&z, &mask, &x, beta, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, beta, " + << "operator) in the range(" << lower_bound << ", " << upper_bound << ")" + << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_x = true; + + size_t local_mask_nz = local_n; + size_t local_x_nz = local_n; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef 
GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta ); + + if( !mask_is_dense ) { + // the output sparsity structure is implied by mask and descr + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + + if( + (descr & descriptors::dense) || + (local_x_nz == local_n) || + (local_mask_nz <= local_x_nz) + ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, true, false, false, descr, OP, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, true, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + true, false, false, true, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, true, + lower_bound, upper_bound, + local_z, &local_mask, local_x, local_y, + z, &mask, x_wrapper, y_wrapper, + op + ); + } + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ?
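+ // The stage is registered with dense_mask set only when the dense,
+ // structural, and non-inverted descriptors together guarantee that the mask
+ // selects every entry, so the pipeline may treat the mask as effectively
+ // dense when analysing this stage.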
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &mask, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( mask ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, beta, " + << "operator)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); +#ifdef _DEBUG + std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-[T3], using monoid)\n"; +#endif + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, x, y, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, &x, &y, &monoid, phase] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, x, y, monoid) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y, local_z; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + ( void )pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION 
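+				// vectors that the pipeline already knows to be completely dense need
+				// no local coordinate reads; their subsets are skipped below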
+ already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + // we are in the unmasked sparse variant + const auto op = monoid.getOperator(); + + if( !already_dense_vectors ) { + // z will have an a-priori unknown sparsity structure + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + } + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + false, true, false, false, descr, typename Monoid::Operator, + OutputType, bool, InputType1, InputType2, Coords + >( + true, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_null_mask, local_x, local_y, + z, null_mask, x_wrapper, y_wrapper, + op + ); + + if( !already_dense_vectors ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
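+		// the stage records the vectors it reads and writes together with their
+		// coordinate sets, so that the pipeline can place it after the stages it
+		// depends on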
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + getID( z ), + &z, nullptr, &internal::getCoordinates( z ), nullptr, + getID( x ), getID( y ), SIZE_MAX, SIZE_MAX, + &x, &y, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); +#ifdef _DEBUG + std::cout << "In unmasked eWiseApply ([T1]<-T2<-[T3], using monoid)\n"; +#endif + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, alpha, y, monoid.getOperator() ); + } + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, alpha, &y, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, alpha, y, monoid) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y, local_z; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + bool already_dense_output = true; +#endif + bool already_dense_input_y = true; + + // when it's guaranteed that the output will become dense + // the only criterion to avoid reading the local coordinates is if it the + // output is already dense +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( 
z ) ); + if( !already_dense_output ) { +#endif + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + // we are in the unmasked sparse variant + const auto &op = monoid.getOperator(); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#endif + local_z.local_assignAllNotAlreadyAssigned(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + + // dispatch to generic function +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_dense_apply_generic< +#else + rc = internal::dense_apply_generic< +#endif + true, false, false, true, descr, typename Monoid::Operator, + OutputType, InputType1, InputType2, Coords + >( + true, already_dense_input_y, + lower_bound, upper_bound, + local_x, local_y, + z, x_wrapper, y_wrapper, op + ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, alpha, y, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); +#ifdef _DEBUG + std::cout << "In unmasked eWiseApply ([T1]<-[T2]<-T3, using monoid)\n"; +#endif + 
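+		// this variant applies the monoid operator between a vector and the scalar
+		// beta; the output is always dense, since entries missing from x contribute
+		// the monoid identity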
// other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, x, beta, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, &x, beta, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, x, beta, monoid) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y, local_z; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + bool already_dense_output = true; +#endif + bool already_dense_input_x = true; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( z ) ); + if( !already_dense_output ) { +#endif + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta ); + + // we are in the unmasked sparse variant + const auto &op = monoid.getOperator(); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#endif + // the result will always be dense + local_z.local_assignAllNotAlreadyAssigned(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + + // dispatch +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_dense_apply_generic< +#else + rc = internal::dense_apply_generic< +#endif + false, true, true, false, descr, typename Monoid::Operator, + OutputType, InputType1, InputType2, Coords + >( + already_dense_input_x, true, + lower_bound, upper_bound, + local_x, local_y, + z, x_wrapper, y_wrapper, + op + ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
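+		// beta is a scalar, so only x and its coordinates are registered with the
+		// pipeline as inputs of this stage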
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, nullptr, nullptr, nullptr, + &internal::getCoordinates( x ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, beta, monoid)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], using monoid)\n"; +#endif + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, x, y, monoid, phase ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, mask, x, y, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, &x, &y, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, y, monoid) in " + << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const 
size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + // we are in the masked sparse variant + const InputType1 left_identity = monoid.template getIdentity< InputType1 >(); + const InputType2 right_identity = + monoid.template getIdentity< InputType2 >(); + const auto &op = monoid.getOperator(); + + if( !mask_is_dense ) { + // z will have an a priori unknown sparsity structure + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + + if( local_x_nz < local_n && + local_y_nz < local_n && + local_x_nz + local_y_nz < local_mask_nz + ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + true, true, false, false, descr, typename Monoid::Operator, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_mask, local_x, local_y, + z, &mask, x_wrapper, y_wrapper, + op + ); + } else if( local_x_nz < local_n && local_y_nz == local_n ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, false, true, false, descr, typename Monoid::Operator, + 
OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + &left_identity, nullptr + ); + } else if( local_y_nz < local_n && local_x_nz == local_n ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, false, false, true, descr, typename Monoid::Operator, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + nullptr, &right_identity + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, false, true, true, descr, typename Monoid::Operator, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + &left_identity, &right_identity + ); + } + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &y, &mask, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + &internal::getCoordinates( mask ), nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, y, " + << "monoid)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), 
"grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], using monoid)\n"; +#endif + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, alpha, y, monoid ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( descr & descriptors::dense ) { + return eWiseApply< descr >( z, mask, alpha, y, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, alpha, y, monoid) " + << "in the range(" << lower_bound << ", " << upper_bound << ")" + << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_y = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + // we are in the masked sparse variant + const InputType2 right_identity = + monoid.template getIdentity< InputType2 >(); + const auto &op = monoid.getOperator(); + + if( !mask_is_dense ) { + // the sparsity structure of z will be a result of the given mask and descr + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + +#ifdef 
GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + true, false, false, true, descr, typename Monoid::Operator, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, true, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + nullptr, &right_identity + ); + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &y, &mask, nullptr, nullptr, + &internal::getCoordinates( y ), &internal::getCoordinates( mask ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, alpha, y, " + << "monoid)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Monoid &monoid = Monoid(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< Monoid >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given monoid" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-[T2]<-T3, using monoid)\n"; +#endif + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, x, beta, monoid ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch to dense variant + if( (descr & descriptors::dense) ) { + return eWiseApply< descr >( z, mask, x, beta, monoid.getOperator() ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool 
dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, &x, beta, &monoid] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, beta, monoid) " + << "in the range(" << lower_bound << ", " << upper_bound << ")" + << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_x = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType2, Coords > y_wrapper( beta ); + + // we are in the masked sparse variant + const InputType1 left_identity = monoid.template getIdentity< InputType1 >(); + const auto &op = monoid.getOperator(); + + if( !mask_is_dense ) { + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, true, true, false, descr, typename Monoid::Operator, + OutputType, MaskType, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, true, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op, + &left_identity + ); + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &mask, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( mask ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, beta, " + << "monoid)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-T2<-[T3]), operator variant\n"; +#endif + // sanity check + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // check if we can dispatch + if( static_cast< const void * >( &z ) == + static_cast< const void * >( &y ) + ) { + return foldr< descr >( alpha, z, op ); + } + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, alpha, &y, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, alpha, y, operator) in " + << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_y = pipeline.containsAlreadyDenseVector( + 
&internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + if( !already_dense_vectors ) { + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); + } + } + + // check for dense variant + if( (descr & descriptors::dense) || local_y_nz == local_n ) { + if( !already_dense_vectors ) { + local_z.local_assignAll( ); + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_dense_apply_generic< +#else + rc = internal::dense_apply_generic< +#endif + true, false, false, false, descr, OP, + OutputType, InputType1, InputType2, Coords + >( + true, already_dense_input_y, + lower_bound, upper_bound, + local_x, local_y, z, + x_wrapper, y_wrapper, + op + ); + } else { + if( !already_dense_vectors ) { + local_z.local_clear(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + } + + // we are in the sparse variant +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< + false, false, true, false, descr, OP, +#else + rc = internal::sparse_apply_generic< + false, false, true, false, descr, OP, +#endif + OutputType, bool, InputType1, InputType2, Coords + >( + true, true, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_null_mask, local_x, local_y, + z, null_mask, x_wrapper, y_wrapper, + op + ); + } + + if( !already_dense_vectors ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, alpha, y, " + << "operator)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-T2<-[T3], operator variant)\n"; +#endif + // check for empty mask + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, alpha, y, op ); + } + + // sanity check + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, alpha, &y, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, alpha, y, " + << "operator) in the range(" << lower_bound << ", " << upper_bound << ")" + << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr 
const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_y = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< true, InputType1, Coords > x_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + if( !mask_is_dense ) { + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + + if( (descr & descriptors::dense) || + (local_y_nz == local_n) || + local_mask_nz <= local_y_nz + ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + true, false, false, false, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, true, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + true, false, true, false, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, true, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_mask, local_x, local_y, + z, &mask, x_wrapper, y_wrapper, + op + ); + } + + if( !mask_is_dense ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &y, &mask, nullptr, nullptr, + &internal::getCoordinates( y ), &internal::getCoordinates( mask ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, alpha, y, " + << "operator)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); +#ifdef _DEBUG + std::cout << "In eWiseApply ([T1]<-[T2]<-[T3]), operator variant\n"; +#endif + // sanity check + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n || + internal::getCoordinates( y ).size() != n + ) { +#ifdef _DEBUG + std::cerr << "\tinput vectors mismatch in dimensions!\n"; +#endif + return MISMATCH; + } + + // check for possible shortcuts + // trivial dispatch + if( n == 0 ) { + return SUCCESS; + } + + // check for possible shortcuts, after dynamic checks + if( getID( x ) == getID( y ) && is_idempotent< OP >::value ) { + return set< descr >( z, x, phase ); + } + if( getID( x ) == getID( z ) ) { + return foldl< descr >( z, y, op, phase ); + } + if( getID( y ) == getID( z ) ) { + return foldr< descr >( x, z, op, phase ); + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, &x, &y, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, x, y, operator) in the " + << "range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + const Coords * const local_null_mask = nullptr; + + Coords local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION 
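+			// when all vectors handled by this pipeline are already dense, the chunk
+			// can take the dense code path without reading any local coordinates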
+ const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !already_dense_vectors ) { + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); + } + } + + if( sparse ) { + if( !already_dense_vectors ) { + local_z.local_clear(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + false, false, false, false, descr | descriptors::dense, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + true, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_null_mask, local_x, local_y, + z, null_mask, x_wrapper, y_wrapper, + op + ); + } else { + if( !already_dense_vectors ) { + local_z.local_assignAll( ); + } + + if( upper_bound > lower_bound ) { + const InputType1 * __restrict__ a = internal::getRaw( x ); + const InputType2 * __restrict__ b = internal::getRaw( y ); + OutputType * __restrict__ c = internal::getRaw( z ); + + // this function is vectorised + op.eWiseApply( a + lower_bound, b + lower_bound, c + lower_bound, local_n); + } + } + + if( !already_dense_vectors ) { + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, true, + getID( z ), + &z, nullptr, &internal::getCoordinates( z ), nullptr, + getID( x ), getID( y ), SIZE_MAX, SIZE_MAX, + &x, &y, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, x, y, operator)" + << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class OP, + typename OutputType, typename MaskType, + typename InputType1, typename InputType2, + typename Coords + > + RC eWiseApply( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const OP &op = OP(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< OP >::value, + void + >::type * const = nullptr + ) { + // static checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D1, InputType1 >::value ), "grb::eWiseApply", + "called with a left-hand input element type that does not match the " + "first domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D2, InputType2 >::value ), "grb::eWiseApply", + "called with a right-hand input element type that does not match the " + "second domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename OP::D3, OutputType >::value ), "grb::eWiseApply", + "called with an output element type that does not match the " + "third domain of the given operator" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::eWiseApply", + "called with an output mask element type that is not Boolean " ); +#ifdef _DEBUG + std::cout << "In masked eWiseApply ([T1]<-[T2]<-[T3], using operator)\n"; +#endif + // check for empty mask + if( size( mask ) == 0 ) { + return eWiseApply< descr >( z, x, y, op, phase ); + } + + // other run-time checks + const size_t n = internal::getCoordinates( z ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( y ).size() != n ) { + return MISMATCH; + } + if( internal::getCoordinates( mask ).size() != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&z, &mask, &x, &y, &op] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseApply(z, mask, x, y, operator) in " + << "the range(" << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y, local_z; + const size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + 
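+			// these defaults correspond to fully dense chunks; the counts are
+			// tightened below once the local coordinates have been read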
size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + bool already_dense_mask = true; + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !mask_is_dense ) { + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + if( dense_descr && local_z.nonzeroes() < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + const internal::Wrapper< false, InputType1, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType2, Coords > y_wrapper( y ); + + const size_t sparse_loop = std::min( local_x_nz, local_y_nz ); + + if( !mask_is_dense ) { + local_z.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( z ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( z ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( z ) ); + } + } + } + + if( (descr & descriptors::dense) || + (local_x_nz == local_n && local_y_nz == local_n) || + ( !(descr & descriptors::invert_mask) && sparse_loop >= local_mask_nz ) + ) { + // use loop over mask +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_apply_generic< +#else + rc = internal::masked_apply_generic< +#endif + false, false, false, false, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_mask, local_x, local_y, + z, mask, x_wrapper, y_wrapper, + op + ); + + } else { + // use loop over sparse inputs +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_apply_generic< +#else + rc = internal::sparse_apply_generic< +#endif + true, false, false, false, descr, OP, + OutputType, bool, InputType1, InputType2, Coords + >( + already_dense_mask, already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_mask, local_x, local_y, + z, &mask, x_wrapper, y_wrapper, + op + ); + } + + if( !mask_is_dense ) { + 
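+ // write the locally updated sparsity pattern of z for this chunk back to
+ // the global coordinate structure; this step is skipped when the mask
+ // already guarantees a dense output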
internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_MASKED_EWISEAPPLY, + n, sizeof( OutputType ), dense_descr, dense_mask, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &y, &mask, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + &internal::getCoordinates( mask ), nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseApply(z, mask, x, y, " + << "operator)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the fourth domain of the given semiring" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- vector + vector) dispatches to " + << "two folds using the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, x, ring.getAdditiveMonoid(), phase ); + ret = ret ? 
ret : foldl< descr >( z, y, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- scalar + vector) dispatches to " + << "two folds with the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, alpha, ring.getAdditiveMonoid(), phase ); + ret = ret ? ret : foldl< descr >( z, y, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- vector + scalar) dispatches to " + << "two folds with the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, x, ring.getAdditiveMonoid(), phase ); + ret = ret ? 
ret : foldl< descr >( z, beta, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- scalar + scalar) dispatches to " + << "foldl with precomputed scalar and additive monoid\n"; +#endif + const typename Ring::D4 add; + (void) apply( add, alpha, beta, ring.getAdditiveOperator() ); + return foldl< descr >( z, add, ring.getAdditiveMonoid(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the fourth domain of the given semiring" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::eWiseAdd (vector <- vector + vector, masked)", + "called with non-bool mask element types" ); +#ifdef 
_DEBUG + std::cout << "eWiseAdd (ascend, vector <- vector + vector, masked) " + << "dispatches to two folds using the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, m, x, ring.getAdditiveMonoid(), phase ); + ret = ret ? ret : foldl< descr >( z, m, y, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::eWiseAdd (vector <- scalar + vector, masked)", + "called with non-bool mask element types" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- scalar + vector, masked) " + << "dispatches to two folds using the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, m, alpha, ring.getAdditiveMonoid(), phase ); + ret = ret ? 
ret : foldl< descr >( z, m, y, ring.getAdditiveMonoid(), phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::eWiseAdd (vector <- vector + scalar, masked)", + "called with non-bool mask element types" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- vector + scalar, masked) " + << "dispatches to eWiseApply using the additive monoid\n"; +#endif + RC ret = foldl< descr >( z, m, x, ring.getAdditiveMonoid(), phase ); + ret = ret ? 
ret : foldl< descr >( z, m, beta, ring.getAdditiveMonoid(), + phase ); + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< OutputType, ascend, Coords > &m, + const InputType1 alpha, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), "grb::eWiseAdd", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), "grb::eWiseAdd", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), "grb::eWiseAdd", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::eWiseAdd (vector <- scalar + scalar, masked)", + "called with non-bool mask element types" ); +#ifdef _DEBUG + std::cout << "eWiseAdd (ascend, vector <- scalar + scalar, masked) " + << "dispatches to foldl with precomputed scalar and additive monoid\n"; +#endif + const typename Ring::D4 add; + (void) apply( add, alpha, beta, ring.getAdditiveOperator() ); + return foldl< descr >( z, m, add, ring.getAdditiveMonoid(), phase ); + } + + // declare an internal version of eWiseMulAdd containing the full sparse & + // dense implementations + namespace internal { + + template< + Descriptor descr, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC sparse_eWiseMulAdd_maskDriven( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &m_vector, + const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ) { + static_assert( !(descr & descriptors::invert_mask), + "Cannot loop over mask nonzeroes if invert_mask is given. 
" + "Please submit a bug report" ); + static_assert( !a_scalar || !x_scalar, + "If both a and x are scalars, this is operation is a simple eWiseApply " + "with the additive operator if the semiring." ); + static_assert( !y_zero || y_scalar, + "If y_zero is given, then y_scalar must be given also." ); + + OutputType * __restrict__ z = internal::getRaw( z_vector ); + const MaskType * __restrict__ const m = internal::getRaw( m_vector ); + + // create local copies of the input const pointers + const InputType1 * __restrict__ const a = a_wrapper.getRaw(); + const InputType2 * __restrict__ const x = x_wrapper.getRaw(); + const InputType3 * __restrict__ const y = y_wrapper.getRaw(); + + const size_t local_n = upper_bound - lower_bound; + const size_t local_m_nz = already_dense_mask ? local_n : local_m.nonzeroes(); + + const size_t local_start = 0; + const size_t local_end = local_m_nz; + + size_t k = local_start; + + // scalar coda and parallel main body + for( ; k < local_end; ++k ) { + const size_t index = ( already_dense_mask ? k : local_m.index( k ) ) + + lower_bound; + assert( index - lower_bound < local_n ); + if( already_dense_mask ) { + if( !internal::getCoordinates( m_vector ).template mask< descr >( + index, m ) + ) { + continue; + } + } else { + if( !local_m.template mask< descr >( index - lower_bound, m + + lower_bound ) + ) { + continue; + } + } + typename Ring::D3 t = ring.template getZero< typename Ring::D3 >(); + if( + ( + a_scalar || already_dense_input_a || + local_a.assigned( index - lower_bound ) + ) && ( + x_scalar || already_dense_input_x || + local_x.assigned( index - lower_bound) + ) + ) { + const InputType1 a_p = ( a_scalar ) + ? a_wrapper.getValue() + : *( a + index ); + const InputType2 x_p = ( x_scalar ) + ? x_wrapper.getValue() + : *( x + index ); + (void) apply( t, a_p, x_p, ring.getMultiplicativeOperator() ); + if( !y_zero && ( + y_scalar || already_dense_input_y || + local_y.assigned( index - lower_bound ) ) + ) { + const InputType3 y_p = ( y_scalar ) + ? 
y_wrapper.getValue() + : *( y + index ); + typename Ring::D4 b; + (void) apply( b, t, y_p, ring.getAdditiveOperator() ); + if( already_dense_output || local_z.assigned( index - lower_bound ) ) { + typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] ); + (void) foldr( b, out, ring.getAdditiveOperator() ); + z[ index ] = static_cast< OutputType >( out ); + } else { + (void) local_z.assign( index - lower_bound ); + z[ index ] = static_cast< OutputType >( b ); + } + } else if( already_dense_output || + local_z.assigned( index - lower_bound ) + ) { + typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] ); + (void) foldr( t, out, ring.getAdditiveOperator() ); + z[ index ] = static_cast< OutputType >( out ); + } else { + (void) local_z.assign( index - lower_bound ); + z[ index ] = static_cast< OutputType >( t ); + } + } else if( !y_zero && ( + already_dense_input_y || y_scalar || + local_y.assigned( index - lower_bound ) ) + ) { + if( already_dense_output || local_z.assigned( index - lower_bound ) ) { + typename Ring::D4 out = static_cast< typename Ring::D4 >( z[ index ] ); + (void) foldr( y[ index ], out, ring.getAdditiveOperator() ); + z[ index ] = static_cast< OutputType >( out ); + } else { + (void)local_z.assign( index - lower_bound ); + z[ index ] = static_cast< OutputType >( t ); + } + } + } + + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr, + bool masked, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool mulSwitched, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC twoPhase_sparse_eWiseMulAdd_mulDriven( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const m_vector, + const Vector< InputType1, ascend, Coords > &a_vector, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ) { + OutputType * __restrict__ z = internal::getRaw( z_vector ); + const MaskType * __restrict__ const m = masked + ? internal::getRaw( *m_vector ) + : nullptr; + const InputType1 * __restrict__ const a = internal::getRaw( a_vector ); + + // create local copies of the input const pointers + const InputType2 * __restrict__ const x = x_wrapper.getRaw(); + + const size_t local_n = upper_bound - lower_bound; + const size_t local_a_nz = already_dense_input_a + ? local_n + : local_a.nonzeroes(); + + for( size_t i = 0; i < local_a_nz; ++i ) { + const size_t index = ( already_dense_input_a ? 
i : local_a.index( i ) ) + + lower_bound; + if( masked ) { + if( already_dense_mask ) { + if( !internal::getCoordinates( *m_vector ).template mask< descr >( + index, m ) + ) { + continue; + } + } else { + if( !local_m->template mask< descr >( index - lower_bound, + m + lower_bound ) + ) { + continue; + } + } + } + + if( x_scalar || already_dense_input_x || + local_x.assigned( index - lower_bound ) + ) { + typename Ring::D3 t; + const InputType1 a_p = *( a + index ); + const InputType2 x_p = ( x_scalar ) + ? x_wrapper.getValue() + : *( x + index ); + + if( mulSwitched ) { + (void) apply( t, x_p, a_p, ring.getMultiplicativeOperator() ); + } else { + (void) apply( t, a_p, x_p, ring.getMultiplicativeOperator() ); + } + + if( already_dense_output || local_z.assign( index - lower_bound ) ) { + typename Ring::D4 b = static_cast< typename Ring::D4 >( z[ index ] ); + (void) foldr( t, b, ring.getAdditiveOperator() ); + z[ index ] = static_cast< OutputType >( b ); + } else { + z[ index ] = static_cast< OutputType >( + static_cast< typename Ring::D4 >( t ) + ); + } + } + } + + RC rc = SUCCESS; + + // now handle addition + if( !y_zero ) { + // now handle addition + if( masked ) { + if( y_scalar ) { + rc = internal::fold_from_scalar_to_vector_generic< +#ifdef GRB_BOOLEAN_DISPATCHER + descr, true, true, true, true, + already_dense_output, already_dense_mask +#else + descr, true, true, true, true +#endif + >( +#ifndef GRB_BOOLEAN_DISPATCHER + already_dense_output, already_dense_mask, +#endif + lower_bound, upper_bound, local_z, local_m, + z_vector, m_vector, y_wrapper.getValue(), + ring.getAdditiveMonoid().getOperator(), EXECUTE + ); + } else { + rc = fold_from_vector_to_vector_generic< +#ifdef GRB_BOOLEAN_DISPATCHER + descr, true, true, true, true, + already_dense_output, already_dense_input_y, already_dense_mask +#else + descr, true, true, true, true +#endif + >( +#ifndef GRB_BOOLEAN_DISPATCHER + already_dense_output, already_dense_input_y, already_dense_mask, +#endif + lower_bound, upper_bound, + local_z, local_m, local_y, + z_vector, m_vector, *( y_wrapper.getPointer() ), + ring.getAdditiveMonoid().getOperator(), EXECUTE + ); + } + } else { + if( y_scalar ) { + rc = fold_from_scalar_to_vector_generic< +#ifdef GRB_BOOLEAN_DISPATCHER + descr, true, true, false, true, + already_dense_output, already_dense_mask +#else + descr, true, true, false, true +#endif + >( +#ifndef GRB_BOOLEAN_DISPATCHER + already_dense_output, already_dense_mask, +#endif + lower_bound, upper_bound, + local_z, local_m, + z_vector, m_vector, y_wrapper.getValue(), + ring.getAdditiveMonoid().getOperator(), EXECUTE + ); + } else { + rc = fold_from_vector_to_vector_generic< +#ifdef GRB_BOOLEAN_DISPATCHER + descr, true, true, false, true, + already_dense_output, already_dense_input_y, already_dense_mask +#else + descr, true, true, false, true +#endif + >( +#ifndef GRB_BOOLEAN_DISPATCHER + already_dense_output, already_dense_input_y, already_dense_mask, +#endif + lower_bound, upper_bound, + local_z, local_m, local_y, + z_vector, m_vector, *( y_wrapper.getPointer() ), + ring.getAdditiveMonoid().getOperator(), EXECUTE + ); + } + } + } + + // done + return rc; + } + + template< + Descriptor descr, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool assign_z, + typename OutputType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC dense_eWiseMulAdd( + const size_t lower_bound, + const size_t upper_bound, + Vector< OutputType, ascend, Coords > &z_vector, + const 
internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring = Ring() + ) { +#ifdef _DEBUG + std::cout << "\tdense_eWiseMulAdd: loop size will be " + << (upper_bound - lower_bound) << " in the range(" << lower_bound << ", " + << upper_bound << ")\n"; +#endif + const size_t start = lower_bound; + const size_t end = upper_bound; + + OutputType * __restrict__ z = internal::getRaw( z_vector ); + + // create local copies of the input const pointers + const InputType1 * __restrict__ a = a_wrapper.getRaw(); + const InputType2 * __restrict__ x = x_wrapper.getRaw(); + const InputType3 * __restrict__ y = y_wrapper.getRaw(); + + assert( z != a ); + assert( z != x ); + assert( z != y ); + assert( a != x || a == nullptr ); + assert( a != y || a == nullptr ); + assert( x != y || x == nullptr ); + + // vector registers + typename Ring::D1 aa[ Ring::blocksize ]; + typename Ring::D2 xx[ Ring::blocksize ]; + typename Ring::D3 tt[ Ring::blocksize ]; + typename Ring::D4 bb[ Ring::blocksize ]; + typename Ring::D4 yy[ Ring::blocksize ]; + typename Ring::D4 zz[ Ring::blocksize ]; + + if( a_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + aa[ b ] = a_wrapper.getValue(); + } + } + if( x_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + xx[ b ] = x_wrapper.getValue(); + } + } + if( y_scalar ) { + if( y_zero ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + yy[ b ] = ring.template getZero< typename Ring::D4 >(); + } + } else { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + yy[ b ] = y_wrapper.getValue(); + } + } + } + + // do vectorised out-of-place operations. Allows for aligned overlap. + // Non-aligned ovelap is not possible due to GraphBLAS semantics. 
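+ // (The loop below processes Ring::blocksize elements per iteration via the
+ // aa/xx/yy/zz/tt/bb buffers: multiply with the semiring's multiplicative
+ // operator, add y with the additive operator, then either assign to z or
+ // fold into its current contents. A scalar tail loop handles the remainder.)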
+ size_t i = start; + // note: read the tail code (under this while loop) comments first for + // greater understanding + while( i + Ring::blocksize <= end ) { +#ifdef _DEBUG + std::cout << "\tdense_eWiseMulAdd: handling block of size " + << Ring::blocksize << " starting at index " << i << "\n"; +#endif + // read-in + if( !a_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + aa[ b ] = static_cast< typename Ring::D2 >( a[ i + b ] ); + } + } + if( !x_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + xx[ b ] = static_cast< typename Ring::D2 >( x[ i + b ] ); + } + } + if( !y_scalar ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + yy[ b ] = static_cast< typename Ring::D4 >( y[ i + b ] ); + } + } + if( !assign_z ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + zz[ b ] = static_cast< typename Ring::D4 >( z[ i + b ] ); + } + } + + // operate + if( !y_zero ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + apply( tt[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() ); + apply( bb[ b ], tt[ b ], yy[ b ], ring.getAdditiveOperator() ); + } + } else { + assert( y_scalar ); + for( size_t b = 0; b < Ring::blocksize; ++b ) { + apply( bb[ b ], aa[ b ], xx[ b ], ring.getMultiplicativeOperator() ); + } + } + if( !assign_z ) { + for( size_t b = 0; b < Ring::blocksize; ++b ) { + foldr( bb[ b ], zz[ b ], ring.getAdditiveOperator() ); + } + } + + // write-out + if( assign_z ) { + for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) { + z[ i ] = static_cast< OutputType >( bb[ b ] ); + } + } else { + for( size_t b = 0; b < Ring::blocksize; ++b, ++i ) { + z[ i ] = static_cast< OutputType >( zz[ b ] ); + } + } + } + + // perform tail + if( !a_scalar ) { + a += i; + } + if( !x_scalar ) { + x += i; + } + if( !y_scalar ) { + y += i; + } + z += i; + for( ; i < end; ++i ) { + // do multiply + const typename Ring::D1 &as = ( a_scalar ) + ? static_cast< typename Ring::D1 >( a_wrapper.getValue() ) + : static_cast< typename Ring::D1 >( *a ); + const typename Ring::D2 &xs = ( x_scalar ) + ? static_cast< typename Ring::D2 >( x_wrapper.getValue() ) + : static_cast< typename Ring::D2 >( *x ); + typename Ring::D4 ys = ( y_scalar ) + ? 
static_cast< typename Ring::D4 >( y_wrapper.getValue() ) + : static_cast< typename Ring::D4 >( *y ); + typename Ring::D3 ts; + + if( !y_zero ) { + RC always_succeeds = apply( ts, as, xs, ring.getMultiplicativeOperator() ); + assert( always_succeeds == SUCCESS ); + always_succeeds = foldr( ts, ys, ring.getAdditiveOperator() ); + assert( always_succeeds == SUCCESS ); +#ifdef NDEBUG + (void) always_succeeds; +#endif + } else { + RC always_succeeds = apply( ys, as, xs, ring.getMultiplicativeOperator() ); + assert( always_succeeds == SUCCESS ); +#ifdef NDEBUG + (void) always_succeeds; +#endif + } + + // write out + if( assign_z ) { + *z = static_cast< OutputType >( ys ); + } else { + RC always_succeeds = foldr( ys, *z, ring.getAdditiveOperator() ); + assert( always_succeeds == SUCCESS ); +#ifdef NDEBUG + (void) always_succeeds; +#endif + } + + // move pointers + if( !a_scalar ) { + (void)a++; + } + if( !x_scalar ) { + (void)x++; + } + if( !y_scalar ) { + (void)y++; + } + (void)z++; + } + + // done + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr, + bool masked, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, + typename MaskType, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd_dispatch( + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const m_vector, + const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const size_t n, + const Ring &ring + ) { + static_assert( !y_zero || y_scalar, "If y is zero, y_scalar must be true. 
" + "Triggering this assertion indicates an incorrect call to this " + "function; please submit a bug report" ); +#ifdef _DEBUG + std::cout << "\t in eWiseMulAdd_dispatch\n"; +#endif + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, &ring] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseMulAdd_dispatch in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_z, local_m, local_a, local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_z_nz = local_n; + size_t local_m_nz = local_n; + size_t local_a_nz = local_n; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + bool already_dense_output = true; + bool already_dense_mask = true; + bool already_dense_input_a = true; + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( z_vector ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_z = internal::getCoordinates( z_vector ).asyncSubset( lower_bound, + upper_bound ); + local_z_nz = local_z.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( masked ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( *m_vector ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_m = internal::getCoordinates( *m_vector ).asyncSubset( + lower_bound, upper_bound ); + local_m_nz = local_m.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !a_scalar ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_a = pipeline.containsAlreadyDenseVector( + a_wrapper.getCoordinates() ); + if ( !already_dense_input_a ) { +#else + already_dense_input_a = false; +#endif + local_a = a_wrapper.getCoordinates()->asyncSubset( lower_bound, + upper_bound ); + local_a_nz = local_a.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !x_scalar ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + x_wrapper.getCoordinates() ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = x_wrapper.getCoordinates()->asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !y_scalar ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_y = pipeline.containsAlreadyDenseVector( + y_wrapper.getCoordinates() ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = y_wrapper.getCoordinates()->asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + } + + // check whether we are in the sparse or dense case + const bool mask_is_dense = !masked || ( + (descr & 
descriptors::structural) && + !(descr & descriptors::invert_mask) && + local_m_nz == local_n + ); + const size_t z_nns = local_z_nz; + + // the below Boolean shall be true only if the inputs a, x, and y generate + // a dense output vector. It furthermore shall be set to false only if the + // output vector was either empty or fully dense. This is done to determine + // the exact case the dense variant of the eWiseMulAdd implementations can + // be used. + const bool sparse = ( a_scalar ? false : ( local_a_nz < local_n ) ) || + ( x_scalar ? false : ( local_x_nz < local_n ) ) || + ( y_scalar ? false : ( local_y_nz < local_n ) ) || + ( z_nns > 0 && z_nns < local_n ) || + ( masked && !mask_is_dense ); + assert( !(sparse && dense_descr) ); +#ifdef _DEBUG + std::cout << "\t\t (sparse, dense)=(" << sparse << ", " << dense_descr + << ")\n"; +#endif + // pre-assign coors if output is dense but was previously totally empty + const bool assign_z = z_nns == 0 && !sparse; + + if( assign_z ) { +#ifdef _DEBUG + std::cout << "\t\t detected output will be dense while " + << "the output vector presently is completely empty. We therefore " + << "pre-assign all output coordinates\n"; +#endif +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#endif + // the result will always be dense + local_z.local_assignAllNotAlreadyAssigned(); + local_z_nz = local_z.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !dense_descr && sparse ) { + // the below computes loop sizes multiplied with the number of vectors that + // each loop needs to touch. Possible vectors are: z, m, a, x, and y. + const size_t mask_factor = masked ? 1 : 0; + const size_t mul_loop_size = ( 3 + mask_factor ) * std::min( + ( a_scalar ? local_n : local_a_nz ), + ( x_scalar ? local_n : local_x_nz ) + ) + ( y_zero ? 0 : + (2 + mask_factor) * ( y_scalar ? local_n : local_y_nz ) + ); +#ifdef _DEBUG + std::cout << "\t\t mul_loop_size = " << mul_loop_size << "\n"; +#endif + + const size_t mask_loop_size = ( y_zero ? 4 : 5 ) * local_m_nz; + + if( masked && mask_loop_size < mul_loop_size ) { +#ifdef _DEBUG + std::cout << "\t\t mask_loop_size= " << mask_loop_size << "\n"; + std::cout << "\t\t will be driven by output mask\n"; +#endif + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_sparse_eWiseMulAdd_maskDriven< +#else + rc = sparse_eWiseMulAdd_maskDriven< +#endif + descr, a_scalar, x_scalar, y_scalar, y_zero + >( + already_dense_output, already_dense_mask, already_dense_input_a, + already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, local_m, local_a, local_x, local_y, + z_vector, *m_vector, a_wrapper, x_wrapper, y_wrapper, + ring + ); + } else { +#ifdef _DEBUG + std::cout << "\t\t will be driven by the multiplication a*x\n"; +#endif + static_assert( !(a_scalar && x_scalar), + "The case of the multiplication being between two scalars should have" + "been caught earlier. Please submit a bug report." 
); + + if( a_scalar ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven< +#else + rc = twoPhase_sparse_eWiseMulAdd_mulDriven< +#endif + descr, masked, a_scalar, y_scalar, y_zero, true + >( + already_dense_output, already_dense_mask, already_dense_input_x, + already_dense_input_a, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_m, local_x, local_a, local_y, + z_vector, m_vector, *(x_wrapper.getPointer()), a_wrapper, y_wrapper, + ring + ); + } else if( x_scalar ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven< +#else + rc = twoPhase_sparse_eWiseMulAdd_mulDriven< +#endif + descr, masked, x_scalar, y_scalar, y_zero, false + >( + already_dense_output, already_dense_mask, already_dense_input_a, + already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_m, local_a, local_x, local_y, + z_vector, m_vector, *(a_wrapper.getPointer()), x_wrapper, y_wrapper, + ring + ); + } else if( local_a_nz <= local_x_nz ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven< +#else + rc = twoPhase_sparse_eWiseMulAdd_mulDriven< +#endif + descr, masked, x_scalar, y_scalar, y_zero, false + >( + already_dense_output, already_dense_mask, already_dense_input_a, + already_dense_input_x, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_m, local_a, local_x, local_y, + z_vector, m_vector, *(a_wrapper.getPointer()), x_wrapper, y_wrapper, + ring + ); + } else { + assert( local_x_nz < local_a_nz ); +#ifdef GRB_BOOLEAN_DISPATCHER + rc = boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven< +#else + rc = twoPhase_sparse_eWiseMulAdd_mulDriven< +#endif + descr, masked, a_scalar, y_scalar, y_zero, true + >( + already_dense_output, already_dense_mask, already_dense_input_x, + already_dense_input_a, already_dense_input_y, + lower_bound, upper_bound, + local_z, &local_m, local_x, local_a, local_y, + z_vector, m_vector, *(x_wrapper.getPointer()), a_wrapper, y_wrapper, + ring + ); + } + } + } else { + // all that remains is the dense case + assert( a_scalar || local_a_nz == local_n ); + assert( x_scalar || local_x_nz == local_n ); + assert( y_scalar || local_y_nz == local_n ); + assert( ! masked || mask_is_dense ); + assert( local_z_nz == local_n ); +#ifdef _DEBUG + std::cout << "\t\t will perform a dense eWiseMulAdd\n"; +#endif + if( assign_z ) { + rc = dense_eWiseMulAdd< + descr, a_scalar, x_scalar, y_scalar, y_zero, true + >( + lower_bound, upper_bound, + z_vector, a_wrapper, x_wrapper, y_wrapper, + ring + ); + } else { + rc = dense_eWiseMulAdd< + descr, a_scalar, x_scalar, y_scalar, y_zero, false + >( + lower_bound, upper_bound, + z_vector, a_wrapper, x_wrapper, y_wrapper, + ring + ); + } + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( z_vector ).asyncJoinSubset( local_z, + lower_bound, upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEMULADD_DISPATCH, + n, sizeof( OutputType ), dense_descr, true, + &z_vector, nullptr, &internal::getCoordinates( z_vector ), nullptr, + masked ? m_vector : nullptr, a_wrapper.getPointer(), + x_wrapper.getPointer(), y_wrapper.getPointer(), + masked ? 
&internal::getCoordinates( *m_vector ) : nullptr, + a_wrapper.getCoordinates(), x_wrapper.getCoordinates(), + y_wrapper.getCoordinates(), + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseMulAdd_dispatch" + << std::endl; +#endif + return ret; + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &x, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n ) { + return MISMATCH; + } + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType1 zeroIT1 = ring.template getZero< InputType1 >(); + if( alpha == zeroIT1 ) { + return foldl< descr >( z, y, ring.getAdditiveMonoid() ); + } + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, true, false, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &a, + const InputType2 chi, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && 
+ !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( y ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType1 zeroIT2 = ring.template getZero< InputType2 >(); + if( chi == zeroIT2 ) { + return foldl< descr >( z, y, ring.getAdditiveMonoid() ); + } + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< true, InputType2, Coords > x_wrapper( chi ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, false, true, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &a, + const Vector< InputType2, ascend, Coords > &x, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & 
descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( x ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, false, false, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &a, + const InputType2 beta, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial dispatches + const InputType2 zeroIT2 = ring.template getZero< InputType2 >(); + if( beta == zeroIT2 ) { + return foldl< descr >( z, gamma, ring.getAdditiveMonoid() ); + } + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< false, InputType1, Coords > 
a_wrapper( a ); + const internal::Wrapper< true, InputType2, Coords > x_wrapper( beta ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, false, true, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &x, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType1 zeroIT1 = ring.template getZero< InputType1 >(); + if( alpha == zeroIT1 ) { + return foldl< descr >( z, gamma, ring.getAdditiveMonoid() ); + } + + const Vector< bool, ascend, Coords > * null_mask = nullptr; + + const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, true, false, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename 
std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "First domain of semiring does not match first input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Second domain of semiring does not match second input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match third input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match output type" ); +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, vector <- scalar x scalar + vector) " + << "precomputes scalar multiply and dispatches to eWiseAdd (ascend, " + << "vector <- scalar + vector)\n"; +#endif + // dynamic checks + const size_t n = size( z ); + if( size( y ) != n ) { return MISMATCH; } + + typename Ring::D3 mul_result; + RC rc = grb::apply( mul_result, alpha, beta, + ring.getMultiplicativeOperator() ); +#ifdef NDEBUG + (void) rc; +#else + assert( rc == SUCCESS ); +#endif + return eWiseAdd< descr >( z, mul_result, y, ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "First domain of semiring does not match first input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Second domain of semiring does not match second input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match third input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match output type" ); +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, vector <- scalar x scalar + scalar) " + << "precomputes scalar operations and dispatches to set (ascend)\n"; +#endif + typename Ring::D3 mul_result; + RC rc 
= grb::apply( mul_result, alpha, beta, + ring.getMultiplicativeOperator() ); +#ifdef NDEBUG + (void) rc; +#endif + assert( rc == SUCCESS ); + typename Ring::D4 add_result; + rc = grb::apply( add_result, mul_result, gamma, ring.getAdditiveOperator() ); +#ifdef NDEBUG + (void) rc; +#endif + assert( rc == SUCCESS ); + return grb::foldl< descr >( z, add_result, ring.getAdditiveMonoid(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &a, + const Vector< InputType2, ascend, Coords > &x, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + (void) ring; + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand vector a with an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n || size( a ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const Vector< bool, ascend, Coords > * const null_mask = nullptr; + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, false, false, false, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, null_mask, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &x, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< 
OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr >( z, alpha, x, y, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType1 zeroIT1 = ring.template getZero< InputType1 >(); + if( alpha == zeroIT1 ) { + return foldl< descr >( z, m, y, ring.getAdditiveMonoid() ); + } + + const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, true, false, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &a, + const InputType2 chi, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha 
of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr >( z, a, chi, y, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( y ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial cases + const InputType2 zeroIT2 = ring.template getZero< InputType2 >(); + if( chi == zeroIT2 ) { + return foldl< descr >( z, m, y, ring.getAdditiveMonoid() ); + } + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< true, InputType2, Coords > x_wrapper( chi ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, false, true, false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &a, + const Vector< InputType2, ascend, Coords > &x, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, void + >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, 
InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr, y_zero >( z, a, x, gamma, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( x ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, false, false, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &a, + const InputType2 beta, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask 
+ if( size( m ) == 0 ) { + return eWiseMulAdd< descr, y_zero >( z, a, beta, gamma, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( a ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial dispatch + const InputType2 zeroIT2 = ring.template getZero< InputType2 >(); + if( zeroIT2 == beta ) { +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, masked, vector<-vector<-scalar<-" + << "scalar) dispatches to foldl\n"; +#endif + return foldl< descr >( z, m, gamma, ring.getAdditiveMonoid() ); + } + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< true, InputType2, Coords > x_wrapper( beta ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, false, true, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool y_zero = false, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &x, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand scalar alpha of an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr, y_zero >( z, alpha, x, gamma, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // catch trivial 
dispatch + const InputType1 zeroIT1 = ring.template getZero< InputType1 >(); + if( alpha == zeroIT1 ) { +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, masked, vector<-scalar<-scalar<-" + << "scalar) dispatches to foldl\n"; +#endif + return foldl< descr >( z, m, gamma, ring.getAdditiveMonoid() ); + } + + const internal::Wrapper< true, InputType1, Coords > a_wrapper( alpha ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< true, InputType3, Coords > y_wrapper( gamma ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, true, false, true, y_zero, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename InputType3, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &a, + const Vector< InputType2, ascend, Coords > &x, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd", + "called with a left-hand vector a with an element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd", + "called with a right-hand vector x with an element type that does not " + "match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd", + "called with an additive vector y with an element type that does not " + "match the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd", + "called with a result vector z with an element type that does not match " + "the fourth domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector m with a non-bool element type" ); + + // catch empty mask + if( size( m ) == 0 ) { + return eWiseMulAdd< descr >( z, a, x, y, ring, phase ); + } + + // dynamic sanity checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n || size( a ) != n || size( m ) != n ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const internal::Wrapper< false, InputType1, Coords > a_wrapper( a ); + const internal::Wrapper< false, InputType2, Coords > x_wrapper( x ); + const internal::Wrapper< false, InputType3, Coords > y_wrapper( y ); + + // sparse or dense case + return internal::eWiseMulAdd_dispatch< + descr, true, false, false, 
false, false, + bool, Ring, InputType1, InputType2, InputType3, OutputType, Coords + >( z, &m, a_wrapper, x_wrapper, y_wrapper, n, ring ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const InputType2 beta, + const Vector< InputType3, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value && + !grb::is_object< MaskType >::value, void + >::type * const = nullptr + ) { + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "First domain of semiring does not match first input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Second domain of semiring does not match second input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match third input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match output type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector with a non-bool element type" ); +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, vector <- scalar x scalar + vector, " + << "masked) precomputes scalar multiply and dispatches to eWiseAdd " + << "(ascend, vector <- scalar + vector, masked)\n"; +#endif + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n || size( y ) != n ) { + return MISMATCH; + } + + typename Ring::D3 mul_result; + RC rc = grb::apply( mul_result, alpha, beta, + ring.getMultiplicativeOperator() ); +#ifdef NDEBUG + (void) rc; +#else + assert( rc == SUCCESS ); +#endif + return grb::eWiseAdd< descr >( z, m, mul_result, y, ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords + > + RC eWiseMulAdd( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const InputType2 beta, + const InputType3 gamma, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + 
"grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "First domain of semiring does not match first input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Second domain of semiring does not match second input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, InputType3 >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match third input type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D4, OutputType >::value ), + "grb::eWiseMulAdd(vector,scalar,scalar,scalar)", + "Fourth domain of semiring does not match output type" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector with a non-bool element type" ); +#ifdef _DEBUG + std::cout << "eWiseMulAdd (ascend, vector <- scalar x scalar + scalar, " + << "masked) precomputes scalar operations and dispatches to foldl " + << "(ascend, masked)\n"; +#endif + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n ) { + return MISMATCH; + } + + typename Ring::D3 mul_result; + RC rc = grb::apply( mul_result, alpha, beta, + ring.getMultiplicativeOperator() ); + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + typename Ring::D4 add_result; + rc = grb::apply( add_result, mul_result, gamma, ring.getAdditiveOperator() ); + assert( rc == SUCCESS ); +#ifdef NDEBUG + (void) rc; +#endif + return grb::foldl( z, m, add_result, ring.getAdditiveMonoid(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Ring & ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + + // dynamic checks + const size_t n = size( z ); + if( size( x ) != n || size( y ) != n ) { + return MISMATCH; + } + + // check trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, vector <- vector x vector) dispatches " + << "to eWiseMulAdd (vector <- vector x vector + 0)\n"; +#endif + return eWiseMulAdd< descr, true 
>( + z, x, y, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + + // dynamic checks + const size_t n = size( z ); + if( size( y ) != n ) { return MISMATCH; } + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( alpha == ring.template getZero< typename Ring::D1 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, vector <- scalar x vector) dispatches " + << "to eWiseMulAdd (vector <- scalar x vector + 0)\n"; +#endif + return eWiseMulAdd< descr, true >( + z, alpha, y, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + + // dynamic checks + const size_t n = size( z ); + if( size( x ) != n ) { + return 
MISMATCH; + } + + // catch trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( beta == ring.template getZero< typename Ring::D2 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend) dispatches to eWiseMulAdd with 0.0 as " + << "additive scalar\n"; +#endif + + return eWiseMulAdd< descr, true >( + z, x, beta, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const InputType1 alpha, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( alpha == ring.template getZero< typename Ring::D1 >() ) { + return SUCCESS; + } + if( beta == ring.template getZero< typename Ring::D2 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend) dispatches to scalar apply and foldl\n"; +#endif + typename Ring::D3 temp; + RC always_success = apply( temp, alpha, beta, + ring.getMultiplicativeOperator() ); + assert( always_success == SUCCESS ); +#ifdef NDEBUG + (void) always_success; +#endif + return foldl< descr >( z, temp, ring.getAdditiveMonoid(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< MaskType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || 
+ std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector with a non-bool element type" ); + + // check for empty mask + if( size( m ) == 0 ) { + return eWiseMul< descr >( z, x, y, ring, phase ); + } + + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n || size( x ) != n || size( y ) != n ) { + return MISMATCH; + } + + // check trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, vector <- vector x vector, masked) " + << "dispatches to eWiseMulAdd (vector <- vector x vector + 0, masked)\n"; +#endif + return eWiseMulAdd< descr, true >( + z, m, x, y, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const Vector< InputType2, ascend, Coords > &y, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< MaskType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector _m with a non-bool element type" ); + + // check for empty mask + if( size( m ) == 0 ) { + return eWiseMul< descr >( z, alpha, y, ring, phase ); + } + + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n || size( y ) != n ) { return MISMATCH; } + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( alpha == ring.template getZero< typename Ring::D1 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, vector <- scalar x vector, masked) " + << "dispatches to eWiseMulAdd (vector <- scalar x vector + 0, masked)\n"; 
+#endif + return eWiseMulAdd< descr, true >( + z, m, alpha, y, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const Vector< InputType1, ascend, Coords > &x, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< MaskType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector _m with a non-bool element type" ); + + // check for empty mask + if( size( m ) == 0 ) { + return eWiseMul< descr >( z, x, beta, ring, phase ); + } + + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n || size( x ) != n ) { return MISMATCH; } + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( beta == ring.template getZero< typename Ring::D2 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, masked) dispatches to masked " + << "eWiseMulAdd with 0.0 as additive scalar\n"; +#endif + return eWiseMulAdd< descr, true >( + z, m, x, beta, ring.template getZero< typename Ring::D4 >(), ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename InputType1, + typename InputType2, + typename OutputType, + typename MaskType, + typename Coords + > + RC eWiseMul( + Vector< OutputType, ascend, Coords > &z, + const Vector< MaskType, ascend, Coords > &m, + const InputType1 alpha, + const InputType2 beta, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< MaskType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D1, InputType1 >::value ), + "grb::eWiseMul", + "called with a left-hand side input vector with element type that does not " + "match the first domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + 
std::is_same< typename Ring::D2, InputType2 >::value ), + "grb::eWiseMul", + "called with a right-hand side input vector with element type that does " + "not match the second domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Ring::D3, OutputType >::value ), + "grb::eWiseMul", + "called with an output vector with element type that does not match the " + "third domain of the given semiring" ); + NO_CAST_OP_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), + "grb::eWiseMulAdd", + "called with a mask vector _m with a non-bool element type" ); + + // check for empty mask + if( size( m ) == 0 ) { + return eWiseMul< descr >( z, alpha, beta, ring, phase ); + } + + // dynamic checks + const size_t n = size( z ); + if( size( m ) != n ) { return MISMATCH; } + + // check for trivial phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // check trivial + if( alpha == ring.template getZero< typename Ring::D1 >() ) { + return SUCCESS; + } + if( beta == ring.template getZero< typename Ring::D2 >() ) { + return SUCCESS; + } + +#ifdef _DEBUG + std::cout << "eWiseMul (ascend, masked) dispatches to masked foldl\n"; +#endif + typename Ring::D3 temp; + const RC always_success = apply( temp, alpha, beta, + ring.getMultiplicativeOperator() ); + assert( always_success == SUCCESS ); +#ifdef NDEBUG + (void) always_success; +#endif + return foldl< descr >( z, m, temp, ring.getAdditiveMonoid(), EXECUTE ); + } + + // internal namespace for implementation of grb::dot + namespace internal { + + template< + Descriptor descr, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_x, + bool already_dense_input_y, +#endif + class AddMonoid, + class AnyOp, + typename InputType1, + typename InputType2, + typename Coords + > + RC sparse_dot_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_input_x, + bool already_dense_input_y, +#endif + typename AddMonoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const size_t local_nz, + const AddMonoid &addMonoid, + const AnyOp &anyOp + ) { +#ifdef _DEBUG + std::cout << "\t\t in sparse variant, nonzero range " << lower_bound << "--" + << upper_bound << ", blocksize " << AnyOp::blocksize << "\n"; +#else + (void) upper_bound; +#endif + + // get raw alias + const InputType1 * __restrict__ a = internal::getRaw( x ); + const InputType2 * __restrict__ b = internal::getRaw( y ); + + size_t i = 0; + if( local_nz > 0 ) { + while( i + AnyOp::blocksize < local_nz ) { + // declare buffers + static_assert( AnyOp::blocksize > 0, + "Configuration error: vectorisation blocksize set to 0!" ); + typename AnyOp::D1 xx[ AnyOp::blocksize ]; + typename AnyOp::D2 yy[ AnyOp::blocksize ]; + typename AnyOp::D3 zz[ AnyOp::blocksize ]; + bool mask[ AnyOp::blocksize ]; + + // prepare registers + for( size_t k = 0; k < AnyOp::blocksize; ++k, ++i ) { + mask[ k ] = already_dense_input_x || + local_x.assigned( already_dense_input_y ? i : local_y.index( i ) ); + } + + // rewind + i -= AnyOp::blocksize; + + // do masked load + for( size_t k = 0; k < AnyOp::blocksize; ++k, ++i ) { + if( mask[ k ] ) { + xx[ k ] = static_cast< typename AnyOp::D1 >( + a[ ( already_dense_input_y ? i : local_y.index( i ) ) + lower_bound ] ); + yy[ k ] = static_cast< typename AnyOp::D2 >( + b[ ( already_dense_input_y ? 
i : local_y.index( i ) ) + lower_bound ] ); + } + } + + // perform element-wise multiplication + if( internal::maybe_noop< AnyOp >::value ) { + // we are forced to first initialise zz before doing masked apply + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >(); + } + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + if( mask[ k ] ) { + GRB_UTIL_IGNORE_MAYBE_UNINITIALIZED // yy and xx cannot be used + // uninitialised or mask + apply( zz[ k ], xx[ k ], yy[ k ], anyOp ); // would be false while zz + GRB_UTIL_RESTORE_WARNINGS // init is just above + } + } + } else { + // if apply surely initialises zz, we could use a blend-like op + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + if( mask[ k ] ) { + apply( zz[ k ], xx[ k ], yy[ k ], anyOp ); + } else { + zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >(); + } + } + } + + // perform reduction into output element + addMonoid.getOperator().foldlArray( thread_local_output, zz, + AnyOp::blocksize ); + //^--> note that this foldl operates on raw arrays, + // and thus should not be mistaken with a foldl + // on a grb::Vector. + } + + // perform element-by-element updates for remainder (if any) + for( ; i < local_nz; ++i ) { + typename AddMonoid::D3 temp = + addMonoid.template getIdentity< typename AddMonoid::D3 >(); + const size_t index = ( already_dense_input_y ? i : local_y.index( i ) ) + + lower_bound; + if( already_dense_input_x || local_x.assigned( index - lower_bound ) ) { + apply( temp, a[ index ], b[ index ], anyOp ); + foldr( temp, thread_local_output, addMonoid.getOperator() ); + } + } + } + + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr = descriptors::no_operation, + class AddMonoid, + class AnyOp, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dot_generic( + OutputType &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const AddMonoid &addMonoid, + const AnyOp &anyOp, + const Phase &phase + ) { + const size_t n = internal::getCoordinates( x ).size(); + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + const size_t start = 0; + const size_t end = n; + + if( end > start ) { + + typename AddMonoid::D3 reduced = + addMonoid.template getIdentity< typename AddMonoid::D3 >(); + + size_t reduced_size = sysconf( _SC_NPROCESSORS_ONLN ) * + config::CACHE_LINE_SIZE::value(); + typename AddMonoid::D3 array_reduced[ reduced_size ]; + + for( + size_t i = 0; + i < reduced_size; + i += config::CACHE_LINE_SIZE::value() + ) { + array_reduced[ i ] = + addMonoid.template getIdentity< typename AddMonoid::D3 >(); + } + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = + [&x, &y, &addMonoid, &anyOp, &array_reduced] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage dot-generic in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const 
bool already_dense_vectors = dense_descr; +#endif + bool already_dense_input_x = true; + bool already_dense_input_y = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( + lower_bound, upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + unsigned int thread_id = + omp_get_thread_num() * config::CACHE_LINE_SIZE::value(); + + if( sparse ) { + if( local_x_nz < local_y_nz ) { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_dot_generic< +#else + rc = internal::sparse_dot_generic< +#endif + descr, AddMonoid, AnyOp, InputType1, InputType2, Coords + >( + already_dense_input_x, already_dense_input_y, + array_reduced[ thread_id ], + lower_bound, upper_bound, + local_x, local_y, + x, y, + local_x_nz, + addMonoid, anyOp + ); + } else { +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_sparse_dot_generic< +#else + rc = internal::sparse_dot_generic< +#endif + descr, AddMonoid, AnyOp, InputType1, InputType2, Coords + >( + already_dense_input_y, already_dense_input_x, + array_reduced[ thread_id ], + lower_bound, upper_bound, + local_y, local_x, x, y, local_y_nz, + addMonoid, anyOp + ); + } + } else { + // get raw alias + const InputType1 * __restrict__ a = internal::getRaw( x ); + const InputType2 * __restrict__ b = internal::getRaw( y ); + + size_t i = lower_bound; + if( upper_bound > lower_bound ) { + while( i + AnyOp::blocksize < upper_bound ) { + // declare buffers + static_assert( AnyOp::blocksize > 0, + "Configuration error: vectorisation blocksize set to 0!" ); + + typename AnyOp::D1 xx[ AnyOp::blocksize ]; + typename AnyOp::D2 yy[ AnyOp::blocksize ]; + typename AnyOp::D3 zz[ AnyOp::blocksize ]; + + // prepare registers + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + xx[ k ] = static_cast< typename AnyOp::D1 >( a[ i ] ); + yy[ k ] = static_cast< typename AnyOp::D2 >( b[ i++ ] ); + } + + // perform element-wise multiplication + if( internal::maybe_noop< AnyOp >::value ) { + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + zz[ k ] = addMonoid.template getIdentity< typename AnyOp::D3 >(); + } + } + for( size_t k = 0; k < AnyOp::blocksize; ++k ) { + apply( zz[ k ], xx[ k ], yy[ k ], anyOp ); + } + + // perform reduction into output element + addMonoid.getOperator().foldlArray( array_reduced[ thread_id ], zz, + AnyOp::blocksize ); + //^--> note that this foldl operates on raw arrays, + // and thus should not be mistaken with a foldl + // on a grb::Vector. 
+#ifdef _DEBUG + std::cout << "\t\t " << ( i - AnyOp::blocksize ) << "--" << i << ": " + << "running reduction = " << array_reduced[ thread_id ] << "\n"; +#endif + } + + // perform element-by-element updates for remainder (if any) + for( ; i < upper_bound; ++i ) { + OutputType temp = addMonoid.template getIdentity< OutputType >(); + apply( temp, a[ i ], b[ i ], anyOp ); + foldr( temp, array_reduced[ thread_id ], addMonoid.getOperator() ); + } + } + } + + // the local coordinates for the input vectors have not been updated as + // they are read-only therefore, we don't need to invoke asyncJoinSubset; + // the output is a scalar + return rc; + }; + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: dot-generic" << std::endl; +#endif + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_DOT_GENERIC, + end, sizeof( OutputType ), dense_descr, true, + nullptr, nullptr, nullptr, nullptr, + &x, &y, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + nullptr, nullptr, + nullptr + ); + + for( + size_t i = 0; + i < reduced_size; + i += config::CACHE_LINE_SIZE::value() + ) { + foldl( reduced, array_reduced[ i ], addMonoid.getOperator() ); + } + + // write back result + z = static_cast< OutputType >( reduced ); + } else { + // this has been tested by the unittest + } + +#ifdef _DEBUG + std::cout << "\t returning " << z << "\n"; +#endif + // done! + return ret; + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class AddMonoid, + class AnyOp, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dot( + OutputType &z, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const AddMonoid &addMonoid = AddMonoid(), + const AnyOp &anyOp = AnyOp(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< AddMonoid >::value && + grb::is_operator< AnyOp >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType1, typename AnyOp::D1 >::value ), "grb::dot", + "called with a left-hand vector value type that does not match the first " + "domain of the given multiplicative operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType2, typename AnyOp::D2 >::value ), "grb::dot", + "called with a right-hand vector value type that does not match the second " + "domain of the given multiplicative operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename AddMonoid::D3, typename AnyOp::D1 >::value ), + "grb::dot", + "called with a multiplicative operator output domain that does not match " + "the first domain of the given additive operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< OutputType, typename AddMonoid::D2 >::value ), "grb::dot", + "called with an output vector value type that does not match the second " + "domain of the given additive operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename AddMonoid::D3, typename AddMonoid::D2 >::value ), + "grb::dot", + "called with an additive operator whose output domain does not match its " + "second input domain" ); + NO_CAST_ASSERT( ( !(descr & 
descriptors::no_casting) || + std::is_same< OutputType, typename AddMonoid::D3 >::value ), "grb::dot", + "called with an output vector value type that does not match the third " + "domain of the given additive operator" ); + +#ifdef _DEBUG + std::cout << "In grb::dot (ascend). " + << "I/O scalar on input reads " << z << "\n"; +#endif + + // dynamic sanity check + const size_t n = internal::getCoordinates( y ).size(); + if( internal::getCoordinates( x ).size() != n ) { + return MISMATCH; + } + +#ifdef _DEBUG + std::cout << "\t dynamic checks pass\n"; +#endif + + // dot will be computed out-of-place here. A separate field is needed because + // of possible multi-threaded computation of the dot. + OutputType oop = addMonoid.template getIdentity< OutputType >(); + + RC ret = SUCCESS; + + ret = internal::dot_generic< descr >( oop, x, y, addMonoid, anyOp, phase ); + + // fold out-of-place dot product into existing input, and exit +#ifdef _DEBUG + std::cout << "\t dot_generic returned " << oop << ", " + << "which will be folded into " << z << " " + << "using the additive monoid\n"; +#endif + ret = ret ? ret : foldl( z, oop, addMonoid.getOperator() ); +#ifdef _DEBUG + std::cout << "\t returning " << z << "\n"; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename IOType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dot( + IOType &x, + const Vector< InputType1, ascend, Coords > &left, + const Vector< InputType2, ascend, Coords > &right, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< IOType >::value && + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cout << "In grb::dot (ascend, semiring version)\n" + << "\t dispatches to monoid-operator version\n"; +#endif + return grb::dot< descr >( x, left, right, ring.getAdditiveMonoid(), + ring.getMultiplicativeOperator(), phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType, + typename Coords + > + RC eWiseMap( const Func f, Vector< DataType, ascend, Coords > &x ) { + + RC ret = SUCCESS; + + const size_t n = internal::getCoordinates( x ).size(); + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [f, &x] ( + internal::Pipeline &pipeline, const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseMap(f, x) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + bool sparse = false; + + bool already_dense_input_x = true; + + if( !dense_descr ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_input_x ) { +#else + already_dense_input_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { + // the sparse case is possible only when the local coordinates are already + // initialized + assert( already_dense_input_x == false 
); + for( size_t k = 0; k < local_x_nz; ++k ) { + DataType &xval = internal::getRaw( x )[ local_x.index( k ) + lower_bound ]; + xval = f( xval ); + } + } else { + for( size_t i = lower_bound; i < upper_bound; ++i ) { + DataType &xval = internal::getRaw( x )[ i ]; + xval = f( xval ); + } + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_EWISEMAP, + n, sizeof( DataType ), dense_descr, true, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseMap(f, x)" << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType1, + typename DataType2, + typename Coords, + typename... Args + > + RC eWiseLambda_helper( + std::vector< const void * > all_vectors_ptr, + size_t maximum_data_type_size, + const Func f, + const Vector< DataType1, ascend, Coords > &x, + const Vector< DataType2, ascend, Coords > &y, + Args const &... args + ) { + // catch mismatch + if( size( x ) != size( y ) ) { + return MISMATCH; + } + + all_vectors_ptr.push_back( &y ); + maximum_data_type_size = std::max( maximum_data_type_size, sizeof( DataType2 ) ); + + // continue + return eWiseLambda_helper( all_vectors_ptr, maximum_data_type_size, f, x, + args... ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType, + typename Coords + > + RC eWiseLambda_helper( + std::vector< const void * > all_vectors_ptr, + size_t maximum_data_type_size, + const Func f, + const Vector< DataType, ascend, Coords > &x + ) { + // all pointers, except one, have been stored, and the last one will be + // stored by the normal eWiseLambda + return eWiseLambda< descr, Func, DataType, Coords >( f, x, all_vectors_ptr, + maximum_data_type_size ); + } + }; + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType1, + typename DataType2, + typename Coords, + typename... Args + > + RC eWiseLambda( + const Func f, + const Vector< DataType1, ascend, Coords > &x, + const Vector< DataType2, ascend, Coords > &y, + Args const &... 
args + ) { + + // create an empty vector to store pointers for all vectors passed to + // eWiseLambda + std::vector< const void * > all_vectors_ptr; + + // invoke the helper function to store the pointers + return internal::eWiseLambda_helper( all_vectors_ptr, 0, f, x, y, args...); + } + + template< + Descriptor descr = descriptors::no_operation, + typename Func, + typename DataType, + typename Coords + > + RC eWiseLambda( + const Func f, + const Vector< DataType, ascend, Coords > &x, + std::vector< const void * > all_vectors_ptr = std::vector< const void *>(), + size_t maximum_data_type_size = 0 + ) { +#ifdef _DEBUG + std::cout << "Info: entering eWiseLambda function on vectors.\n"; +#endif + + all_vectors_ptr.push_back( &x ); + maximum_data_type_size = + std::max( maximum_data_type_size, sizeof( DataType ) ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [f, &x] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage eWiseLambda in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + Coords local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( local_x_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( sparse ) { + if ( already_dense_output ) { + for( size_t k = 0; k < local_x_nz; ++k ) { + f( k + lower_bound ); + } + } else { + for( size_t k = 0; k < local_x_nz; ++k ) { + const size_t i = local_x.index( k ) + lower_bound; + f( i ); + } + } + } else { + for (size_t i = lower_bound; i < upper_bound; i++) { + f( i ); + } + } + + // the local coordinates for all vectors of eWiseLambda cannot change + // therefore, we don't need to invoke asyncJoinSubset for any of them + + return SUCCESS; + }; + + // eWiseLambda is a special case as we don't know which of the accessed + // vectors are read-only therefore, we assume that all vectors may be written, + // but the sparsity structure cannot change i.e., the coordinates of each + // vector cannot be updated, but we pass the coordinates of x for the loop + // size + ret = ret ? 
ret : internal::le.addeWiseLambdaStage( + std::move( func ), + internal::Opcode::BLAS1_EWISELAMBDA, + internal::getCoordinates( x ).size(), maximum_data_type_size, dense_descr, + all_vectors_ptr, &internal::getCoordinates( x ) + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: eWiseLambda" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename InputType, + typename IOType, + typename MaskType, + typename Coords + > + RC foldl( + IOType &x, + const Vector< InputType, ascend, Coords > &y, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid = Monoid(), + const typename std::enable_if< !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value && + !grb::is_object< MaskType >::value && + grb::is_monoid< Monoid >::value, void + >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cout << "foldl: IOType <- [InputType] with a monoid called. " + << "Array has size " << size( y ) << " with " << nnz( y ) << " nonzeroes. " + << "It has a mask of size " << size( mask ) << " with " << nnz( mask ) + << " nonzeroes.\n"; +#endif + + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, InputType >::value ), "grb::foldl", + "called with a scalar IO type that does not match the input vector type" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D1 >::value ), "grb::foldl", + "called with an input vector value type that does not match the first " + "domain of the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D2 >::value ), "grb::foldl", + "called with an input vector type that does not match the second domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D3 >::value ), "grb::foldl", + "called with an input vector type that does not match the third domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< bool, MaskType >::value ), "grb::foldl", + "called with a vector mask type that is not boolean" ); + + if( size( mask ) > 0 ) { + return internal::template fold_from_vector_to_scalar_generic< + descr, true, true + >( x, y, mask, monoid ); + } else { + return internal::template fold_from_vector_to_scalar_generic< + descr, false, true + >( x, y, mask, monoid ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Monoid, + typename IOType, + typename InputType, + typename Coords + > + RC foldl( + IOType &x, + const Vector< InputType, ascend, Coords > &y, + const Monoid &monoid = Monoid(), + const typename std::enable_if< + !grb::is_object< IOType >::value && + !grb::is_object< InputType >::value && + grb::is_monoid< Monoid >::value, void + >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cout << "foldl: IOType <- [InputType] with a monoid called. " + << "Array has size " << size( y ) << " with " << nnz( y ) << " nonzeroes. 
" + << "It has no mask.\n"; +#endif + + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< IOType, InputType >::value ), "grb::reduce", + "called with a scalar IO type that does not match the input vector type" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D1 >::value ), "grb::reduce", + "called with an input vector value type that does not match the first " + "domain of the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D2 >::value ), "grb::reduce", + "called with an input vector type that does not match the second domain of " + "the given monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, typename Monoid::D3 >::value ), "grb::reduce", + "called with an input vector type that does not match the third domain of " + "the given monoid" ); + + // do reduction + Vector< bool, ascend, Coords > empty_mask( 0 ); + return internal::template fold_from_vector_to_scalar_generic< + descr, false, true + >( x, y, empty_mask, monoid ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename T, + typename U, + typename Coords + > + RC zip( + Vector< std::pair< T, U >, ascend, Coords > &z, + const Vector< T, ascend, Coords > &x, + const Vector< U, ascend, Coords > &y, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< T >::value && + !grb::is_object< U >::value, + void >::type * const = nullptr + ) { + const size_t n = size( z ); + if( n != size( x ) ) { + return MISMATCH; + } + if( n != size( y ) ) { + return MISMATCH; + } + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + const T * const x_raw = internal::getRaw( x ); + const U * const y_raw = internal::getRaw( y ); + std::pair< T, U > * z_raw = internal::getRaw( z ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&z, x_raw, y_raw, z_raw] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + std::cout << "\t\tExecution of stage zip(z, x, y) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_z; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + bool already_dense_output = true; +#else + (void) pipeline; +#endif + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( z ) ); + if( !dense_descr && !already_dense_output ) { +#else + if( !dense_descr ) { +#endif + local_z = internal::getCoordinates( z ).asyncSubset( lower_bound, + upper_bound ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !dense_descr && !already_dense_output ) { +#else + if( !dense_descr ) { +#endif + // the result will always be dense + local_z.local_assignAllNotAlreadyAssigned(); + } + + for( size_t i = lower_bound; i < upper_bound; ++i ) { + z_raw[ i ].first = x_raw[ i ]; + z_raw[ i ].second = y_raw[ i ]; + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !dense_descr && !already_dense_output ) { +#else + if( !dense_descr ) { +#endif + internal::getCoordinates( z ).asyncJoinSubset( local_z, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_ZIP, + internal::getCoordinates( x ).size(), sizeof( T ) + sizeof( U ), + dense_descr, true, + &z, nullptr, &internal::getCoordinates( z ), nullptr, + &x, &y, nullptr, nullptr, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: zip(z, x, y)" << std::endl; +#endif + assert( false ); + return UNSUPPORTED; + } + + template< + Descriptor descr = descriptors::no_operation, + typename T, + typename U, + typename Coords + > + RC unzip( + Vector< T, ascend, Coords > &x, + Vector< U, ascend, Coords > &y, + const Vector< std::pair< T, U >, ascend, Coords > &in, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< T >::value && + !grb::is_object< U >::value, + void >::type * const = nullptr + ) { + const size_t n = size( in ); + if( n != size( x ) ) { + return MISMATCH; + } + if( n != size( y ) ) { + return MISMATCH; + } + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + T * const x_raw = internal::getRaw( x ); + U * const y_raw = internal::getRaw( y ); + const std::pair< T, U > * in_raw = internal::getRaw( in ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&x, &y, x_raw, y_raw, in_raw] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage unzip(x, y, in) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y; + + bool already_dense_output_x = true; + bool already_dense_output_y = true; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output_x = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !dense_descr && !already_dense_output_x ) { +#else + if( !dense_descr ) { + already_dense_output_x = false; +#endif + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x.local_assignAllNotAlreadyAssigned(); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !dense_descr && !already_dense_output_y ) { +#else + if( !dense_descr ) { + already_dense_output_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y.local_assignAllNotAlreadyAssigned(); + } + + for( size_t i = lower_bound; i < upper_bound; ++i ) { + x_raw[ i ] = in_raw[ i ].first; + y_raw[ i ] = in_raw[ i ].second; + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !dense_descr && !already_dense_output_x ) { +#else + if( !dense_descr ) { +#endif + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !dense_descr && !already_dense_output_y ) { +#else + if( !dense_descr ) { +#endif + internal::getCoordinates( y ).asyncJoinSubset( local_y, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS1_UNZIP, + internal::getCoordinates( x ).size(), std::max( sizeof( T ), sizeof( U ) ), + dense_descr, true, + &x, &y, + &internal::getCoordinates( x ), &internal::getCoordinates( y ), + &in, nullptr, nullptr, nullptr, + &internal::getCoordinates( in ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: unzip(x, y, in)" << std::endl; +#endif + assert( false ); + return UNSUPPORTED; + } + +/** @} */ +// ^-- ends BLAS-1 NB module + +} // end namespace ``grb'' + +#undef NO_CAST_ASSERT +#undef NO_CAST_OP_ASSERT + +#endif // end `_H_GRB_ASCEND_BLAS1' + diff --git a/include/graphblas/ascend/blas2.hpp b/include/graphblas/ascend/blas2.hpp new file mode 100644 index 000000000..8f764bf8d --- /dev/null +++ b/include/graphblas/ascend/blas2.hpp @@ -0,0 +1,1552 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Defines the Ascend level-2 primitives + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BLAS2 +#define _H_GRB_ASCEND_BLAS2 + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include "coordinates.hpp" +#include "forward.hpp" +#include "matrix.hpp" +#include "vector.hpp" +#include "boolean_dispatcher_blas2.hpp" + +#ifdef _DEBUG + #include "spmd.hpp" +#endif + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "****************************************************************" \ + "****************************************************************" \ + "**************************************\n" \ + "* ERROR | " y " " z ".\n" \ + "****************************************************************" \ + "****************************************************************" \ + "**************************************\n" \ + "* Possible fix 1 | Remove no_casting from the template " \ + "parameters in this call to " y ".\n" \ + "* Possible fix 2 | Provide objects with element types or " \ + "domains that match the expected type.\n" \ + "****************************************************************" \ + "****************************************************************" \ + "**************************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } +} + +namespace grb { + + /** + * \addtogroup ascend + * @{ + */ + + // put the generic mxv implementation in an internal namespace + namespace internal { + + template< + bool output_dense, + bool left_handed, + class AdditiveMonoid, + class Multiplication, + template< typename > class One, + typename IOType, + typename InputType, + typename SourceType, + typename Coords + > + class addIdentityDuringMV< + ascend, true, output_dense, left_handed, + AdditiveMonoid, Multiplication, One, + IOType, InputType, SourceType, Coords + > { + + public: + + static void apply( + 
Vector< IOType, ascend, Coords > &destination_vector, + IOType * __restrict__ const &destination, + const size_t &destination_range, + const size_t &source_index, + const AdditiveMonoid &add, + const Multiplication &mul, + const SourceType &input_element, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &dst_global_to_local + ) { + + } + }; + + template< + Descriptor descr, + bool masked, + bool input_masked, + bool left_handed, + template< typename > class One, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_destination_vector, + bool already_dense_mask_vector, +#endif + class AdditiveMonoid, + class Multiplication, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename Coords, + typename RowColType, + typename NonzeroType + > + inline void vxm_inner_kernel_gather( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_destination_vector, + bool already_dense_mask_vector, +#endif + RC &rc, + const size_t lower_bound, + Coords &local_destination_vector, + const Coords &local_mask_vector, + Vector< IOType, ascend, Coords > &destination_vector, + IOType &destination_element, + const size_t &destination_index, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_range, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const Vector< InputType4, ascend, Coords > &source_mask_vector, + const InputType4 * __restrict__ const &source_mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &src_global_to_local, + const std::function< size_t( size_t ) > &dst_local_to_global + ) { +#ifndef _DEBUG + (void) destination_vector; +#endif + constexpr bool add_identity = descr & descriptors::add_identity; + constexpr bool dense_hint = descr & descriptors::dense; + constexpr bool explicit_zero = descr & descriptors::explicit_zero; +#ifdef _DEBUG + constexpr bool use_index = descr & descriptors::use_index; +#endif + assert( rc == SUCCESS ); + + // check whether we should compute output here + if( masked ) { + if( already_dense_mask_vector ) { + if( !internal::getCoordinates( mask_vector ).template + mask< descr >( destination_index, mask ) + ) { +#ifdef _DEBUG + std::cout << "Masks says to skip processing destination index " << + destination_index << "\n"; +#endif + return; + } + } else { + if( !local_mask_vector.template + mask< descr >( destination_index - lower_bound, mask ) + ) { +#ifdef _DEBUG + std::cout << "Masks says to skip processing destination index " << + destination_index << "\n"; +#endif + return; + } + } + } + + // take shortcut, if possible + if( grb::has_immutable_nonzeroes< AdditiveMonoid >::value && ( + already_dense_destination_vector || + local_destination_vector.assigned( destination_index - lower_bound ) + ) && destination_element != add.template getIdentity< IOType >() + ) { + return; + } + + // start output + typename AdditiveMonoid::D3 output = + add.template getIdentity< typename AdditiveMonoid::D3 >(); + bool set = false; + + // if we need to add identity, do so first: + if( add_identity ) { + const size_t id_location = src_global_to_local( dst_local_to_global( + destination_index ) ); + // the SpMV primitive may access 
non-local elements, and thus referring to + // the input vector by using local coordinates is incorrect + // the input vector of an SpMV cannot be updated, i.e., written, by another + // primitive executed in the same pipeline with the current SpMV + // therefore, in the current design, it's safe to use global coordinates for + // the input vector + if( ( !input_masked || + internal::getCoordinates( source_mask_vector ).template + mask< descr >( id_location, source_mask ) + ) && id_location < source_range + ) { + if( dense_hint || internal::getCoordinates( source_vector ).assigned( id_location ) ) { + typename AdditiveMonoid::D1 temp; + internal::CopyOrApplyWithIdentity< + !left_handed, typename AdditiveMonoid::D1, InputType1, One + >::set( temp, source_vector[ id_location ], mul ); + internal::CopyOrApplyWithIdentity< + false, typename AdditiveMonoid::D3, typename AdditiveMonoid::D1, + AdditiveMonoid::template Identity + >::set( output, temp, add ); + set = true; + } + } + } + + // handle row or column at destination_index + // NOTE: This /em could be parallelised, but will probably only slow things + // down +#ifdef _DEBUG + std::cout << "vxm_gather: processing destination index " << destination_index << " / " + << internal::getCoordinates( destination_vector ).size() + << ". Input matrix has " << ( matrix.col_start[ destination_index + 1 ] - + matrix.col_start[ destination_index ] ) << " nonzeroes.\n"; +#endif + for( + size_t k = matrix.col_start[ destination_index ]; + rc == SUCCESS && + k < static_cast< size_t >( matrix.col_start[ destination_index + 1 ] ); + ++k + ) { + // declare multiplication output field + typename Multiplication::D3 result = + add.template getIdentity< typename AdditiveMonoid::D3 >(); + // get source index + const size_t source_index = matrix.row_index[ k ]; + // check mask + if( input_masked && + !internal::getCoordinates( source_mask_vector ).template + mask< descr >( source_index, source_mask ) + ) { +#ifdef _DEBUG + std::cout << "\t vxm_gather: skipping source index " << source_index + << " due to input mask\n"; +#endif + continue; + } + // check for sparsity at source + if( !dense_hint ) { + if( !internal::getCoordinates( source_vector ).assigned( source_index ) ) { +#ifdef _DEBUG + std::cout << "\t vxm_gather: Skipping out of computation with source " + << "index " << source_index << " since it does not contain a nonzero\n"; +#endif + continue; + } + } + // get nonzero + typedef typename std::conditional< + left_handed, + typename Multiplication::D2, + typename Multiplication::D1 + >::type RingNonzeroType; + const RingNonzeroType nonzero = + matrix.template getValue( k, One< RingNonzeroType >::value() ); +#ifdef _DEBUG + std::cout << "\t vxm_gather: interpreted nonzero is " << nonzero << ", " + << "which is the " << k << "-th nonzero and has source index " + << source_index << "\n"; +#endif + // check if we use source element or whether we use its index value instead + typedef typename std::conditional< + left_handed, + typename Multiplication::D1, + typename Multiplication::D2 + >::type SourceType; + const SourceType apply_source = internal::ValueOrIndex< + descr, SourceType, InputType1 + >::getFromArray( source, src_local_to_global, source_index ); +#ifdef _DEBUG + if( use_index ) { + std::cout << "\t vxm_gather (use_index descriptor): apply( output, matrix " + << "nonzero, vector nonzero, * ) = apply( "; + } else { + std::cout << "\t vxm_gather: apply( output, matrix nonzero, vector " + << "nonzero, * ) = apply( "; + } + std::cout << " output, " << 
nonzero << ", " << source << ", * )\n"; +#endif + //multiply + internal::leftOrRightHandedMul< + left_handed, typename Multiplication::D3, + SourceType, RingNonzeroType, Multiplication + >::mul( result, apply_source, nonzero, mul ); +#ifdef _DEBUG + std::cout << "\t vxm_gather: output (this nonzero) = " << result << "\n"; +#endif + + // accumulate +#ifdef _DEBUG + std::cout << "\t vxm_gather: foldr( " << result << ", " << output + << ", + );\n"; +#endif + rc = foldr( result, output, add.getOperator() ); +#ifdef _DEBUG + std::cout << "\t vxm_gather: output (sum at destination) = " << output + << "\n"; +#endif + set = true; + + // sanity check (but apply cannot fail) + assert( rc == SUCCESS ); + } + +#ifdef _DEBUG + if( set ) { + std::cout << "\t vxm_gather: local contribution to this output element at " + << "index " << destination_index << " will be " << output << " " + << "and this corresponds to an explicitly set nonzero.\n"; + } else { + std::cout << "\t vxm_gather: local contribution to this output element at " + << "index " << destination_index << " will be " << output << " and this " + << "is an unset value.\n"; + if( already_dense_destination_vector || + local_destination_vector.assigned( destination_index - lower_bound ) + ) { + std::cout << "\t(old value " << destination_element << " will remain " + << "unmodified.)\n"; + } else { + std::cout << "\t(no old value existed so the output vector will remain " + << "unset at this index.)\n"; + } + } +#endif + // finally, accumulate in output + if( explicit_zero || set ) { +#ifdef _DEBUG + std::cout << "\taccumulating " << output << " into output vector...\n"; +#endif + if( already_dense_destination_vector || + local_destination_vector.assign( destination_index - lower_bound ) + ) { +#ifdef _DEBUG + std::cout << "\tfoldl( " << destination_element << ", " << output << ", " + << "add.getOperator() );, destination_element = "; +#endif + rc = foldl( destination_element, output, add.getOperator() ); +#ifdef _DEBUG + std::cout << destination_element << "\n"; +#endif + } else { +#ifdef _DEBUG + std::cout << "\toutput vector element was previously not set. 
Old " + << "(possibly uninitialised value) " << destination_element << " will " + << "now be set to " << output << ", result (after, possibly, casting): "; +#endif + destination_element = static_cast< IOType >( output ); +#ifdef _DEBUG + std::cout << destination_element << "\n"; +#endif + } + } + } + + template< + Descriptor descr, + bool masked, + bool input_masked, + bool left_handed, + bool using_semiring, + template< typename > class One, + class AdditiveMonoid, + class Multiplication, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm_generic( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add, + const Multiplication &mul, + const Phase &phase, + const std::function< size_t( size_t ) > row_l2g, + const std::function< size_t( size_t ) > row_g2l, + const std::function< size_t( size_t ) > col_l2g, + const std::function< size_t( size_t ) > col_g2l + ) { + // type sanity checking + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + std::is_same< InputType3, bool >::value + ), "vxm (any variant)", + "Mask type is not boolean" ); + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + !left_handed || + std::is_same< InputType1, typename Multiplication::D1 >::value + ), "vxm (any variant)", + "Input vector type does not match multiplicative operator first " + "input domain" ); + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + left_handed || + std::is_same< InputType2, typename Multiplication::D1 >::value + ), "vxm (any variant)", + "Input vector type does not match multiplicative operator second " + "input domain" ); + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + !left_handed || + std::is_same< InputType2, typename Multiplication::D2 >::value + ), "vxm (any variant)", + "Input matrix type does not match multiplicative operator second " + "input domain" ); + NO_CAST_ASSERT( ( descr > internal::MAX_DESCRIPTOR_VALUE || + !(descr & descriptors::no_casting) || + left_handed || + std::is_same< InputType1, typename Multiplication::D2 >::value + ), "vxm (any variant)", + "Input matrix type does not match multiplicative operator first " + "input domain" ); + + RC ret = SUCCESS; + +#ifdef _DEBUG + const auto s = spmd< ascend >::pid(); + std::cout << s << ": ascend vxm called with a " + << descriptors::toString( descr ) << "\n"; +#endif + + // get input and output vector sizes + const size_t m = internal::getCoordinates( u ).size(); + const size_t n = internal::getCoordinates( v ).size(); + + // get whether the matrix should be transposed prior to execution of this + // vector-times-matrix operation + constexpr bool transposed = descr & descriptors::transpose_matrix; + + // check for dimension mismatch + if( ( transposed && ( n != ncols( A ) || m != nrows( A ) ) ) + || ( !transposed && ( n != nrows( A ) || m != ncols( A ) ) ) ) { +#ifdef _DEBUG + std::cout << "Mismatch of columns ( " << n << " vs. " << ncols( A ) + << " ) or rows ( " << m << " vs. 
" << nrows( A ) << " ) with " + << "transposed value " << ((int)transposed) << "\n"; +#endif + return MISMATCH; + } + + // check density + if( descr & descriptors::dense ) { + // it's safe to check the number of nonzeroes for the input vector and its + // mask since both of them are read-only in the current design for + // ascend execution + if( nnz( v ) < size( v ) ) { +#ifdef _DEBUG + std::cout << "\t Dense descriptor given but input vector was sparse\n"; +#endif + return ILLEGAL; + } + if( size( v_mask ) > 0 && nnz( v_mask ) < size( v_mask ) ) { +#ifdef _DEBUG + std::cout << "\t Dense descriptor given but input mask has sparse " + << "structure\n"; +#endif + return ILLEGAL; + } + } + + // check mask + if( masked ) { + if( (transposed && internal::getCoordinates( mask ).size() != nrows( A ) ) || + ( !transposed && internal::getCoordinates( mask ).size() != ncols( A ) ) + ) { +#ifdef _DEBUG + std::cout << "Mismatch of mask size ( " + << internal::getCoordinates( mask ).size() << " ) versus matrix rows " + << "or columns ( " << nrows( A ) << " or " << ncols( A ) << " with " + << "transposed value " << ((int)transposed) << "\n"; +#endif + return MISMATCH; + } + } + + // handle resize phase + if( phase == RESIZE ) { + return SUCCESS; + } + + // get raw pointers + assert( phase == EXECUTE ); + const InputType1 * __restrict__ const x = internal::getRaw( v ); + const InputType3 * __restrict__ const z = internal::getRaw( mask ); + const InputType4 * __restrict__ const vm = internal::getRaw( v_mask ); + IOType * __restrict__ const y = internal::getRaw( u ); + + // check for illegal arguments + if( !(descr & descriptors::safe_overlap) && + reinterpret_cast< const void * >( y ) == + reinterpret_cast< const void * >( x ) + ) { + std::cerr << "Warning: grb::internal::vxm_generic called with overlapping " + << "input and output vectors.\n"; + return OVERLAP; + } + if( masked && (reinterpret_cast(y) == + reinterpret_cast(z)) + ) { + std::cerr << "Warning: grb::internal::vxm_generic called with overlapping " + << "mask and output vectors.\n"; + return OVERLAP; + } + +#ifdef _DEBUG + std::cout << s << ": performing SpMV / SpMSpV using an " << nrows( A ) + << " by " << ncols( A ) << " matrix holding " << nnz( A ) + << " nonzeroes.\n"; +#endif + + // in the current design for ascend execution, the input vectors of + // vxm_generic // cannot be overwritten by another stage of the same + // pipeline, and therefore, it's safe to rely on the global coordinates of + // the input vectors, as they are read-only this property is of special + // importance when handling matrices of size "m" x "n" since the mismatch + // between "m" and "n" requires special handling for the local coordinates of + // the input vectors, the current design relies on the size of the output + // vector which should match the sizes of all other vectors in the pipeline + // the size of the input vector does not have to match the size of the other + // vectors as long as the input vectors are read-only + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [ + &u, &mask, &v, &v_mask, &A, &add, &mul, + row_l2g, row_g2l, col_l2g, col_g2l, + y, x, z, vm +#ifdef _DEBUG + , s +#endif + ] ( + internal::Pipeline &pipeline, + const size_t lower_bound, const size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage vxm_generic in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + (void) pipeline; + + RC rc = 
SUCCESS; + + Coords local_u, local_mask; + const size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_output = true; + bool already_dense_output_mask = true; + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( u ) ); + if( !already_dense_output ) { +#else + already_dense_output = false; +#endif + local_u = internal::getCoordinates( u ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + if( masked ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_output_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_output_mask ) { +#else + already_dense_output_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + } + + // check if transpose is required + if( descr & descriptors::transpose_matrix ) { + // start compute u=vA^T +#ifdef _DEBUG + std::cout << s << ": in u=vA^T=Av variant\n"; +#endif + + // start u=vA^T using CRS + // matrix = &(A.CRS); + // TODO internal issue #193 + if( !masked || (descr & descriptors::invert_mask) ) { + // loop over all columns of the input matrix (can be done in parallel): +#ifdef _DEBUG + std::cout << s << ": in full CRS variant (gather)\n"; +#endif + + for( size_t i = lower_bound; i < upper_bound; i++ ) { +#ifdef GRB_BOOLEAN_DISPATCHER + boolean_dispatcher_vxm_inner_kernel_gather< +#else + vxm_inner_kernel_gather< +#endif + descr, masked, input_masked, left_handed, One + >( + already_dense_output, already_dense_output_mask, + rc, lower_bound, local_u, local_mask, + u, y[ i ], i, v, x, nrows( A ), internal::getCRS( A ), + mask, z, v_mask, vm, add, mul, + row_l2g, col_l2g, col_g2l + ); + } + + } else { +#ifdef _DEBUG + std::cout << s << ": in masked CRS variant (gather). Mask has " + << local_mask_nz << " nonzeroes and size " << local_n << ":\n"; + for( size_t k = 0; k < local_mask_nz; ++k ) { + std::cout << " " + << ( ( already_dense_output_mask ? k : local_mask.index( k ) ) + + lower_bound ); + } + std::cout << "\n"; +#endif + assert( masked ); + + for( size_t k = 0; k < local_mask_nz; ++k ) { + const size_t i = + ( already_dense_output_mask ? 
k : local_mask.index( k ) ) + + lower_bound; + assert( i < nrows(A) ); + +#ifdef GRB_BOOLEAN_DISPATCHER + boolean_dispatcher_vxm_inner_kernel_gather< +#else + vxm_inner_kernel_gather< +#endif + descr, false, input_masked, left_handed, One + >( + already_dense_output, already_dense_output_mask, + rc, lower_bound, local_u, local_mask, + u, y[ i ], i, v, x, nrows( A ), internal::getCRS( A ), + mask, z, v_mask, vm, add, mul, + row_l2g, col_l2g, col_g2l + ); + } + } + // end compute u=vA^T + } else { +#ifdef _DEBUG + std::cout << s << ": in u=vA=A^Tv variant\n"; +#endif + // start u=vA using CCS +#ifdef _DEBUG + std::cout << s << ": in column-major vector times matrix variant (u=vA)\n" + << "\t(this variant relies on the gathering inner kernel)\n"; +#endif + + // if not transposed, then CCS is the data structure to go: + // TODO internal issue #193 + if( !masked || (descr & descriptors::invert_mask) ) { +#ifdef _DEBUG + std::cout << s << ": loop over all input matrix columns\n"; +#endif + + for( size_t j = lower_bound; j < upper_bound; j++ ) { +#ifdef GRB_BOOLEAN_DISPATCHER + boolean_dispatcher_vxm_inner_kernel_gather< +#else + vxm_inner_kernel_gather< +#endif + descr, masked, input_masked, left_handed, One + >( + already_dense_output, already_dense_output_mask, + rc, lower_bound, local_u, local_mask, + u, y[ j ], j, v, x, nrows( A ), internal::getCCS( A ), + mask, z, v_mask, vm, add, mul, + row_l2g, row_g2l, col_l2g + ); + } + } else { + // loop only over the nonzero masks (can still be done in parallel!) +#ifdef _DEBUG + std::cout << s << ": loop over mask indices\n"; +#endif + assert( masked ); + + for( size_t k = 0; k < local_mask_nz; ++k ) { + const size_t j = + ( already_dense_output_mask ? k : local_mask.index( k ) ) + lower_bound; +#ifdef GRB_BOOLEAN_DISPATCHER + boolean_dispatcher_vxm_inner_kernel_gather< +#else + vxm_inner_kernel_gather< +#endif + descr, masked, input_masked, left_handed, One + >( + already_dense_output, already_dense_output_mask, + rc, lower_bound, local_u, local_mask, + u, y[ j ], j, v, x, nrows( A ), internal::getCCS( A ), + mask, z, v_mask, vm, add, mul, + row_l2g, row_g2l, col_l2g + ); + } + } + // end computing u=vA + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + if( !already_dense_output ) { +#else + if( !already_dense_vectors ) { +#endif + internal::getCoordinates( u ).asyncJoinSubset( local_u, lower_bound, + upper_bound ); + } + + return rc; + }; + + // since the local coordinates are never used for the input vector and the + // input mask they are added only for verification of legal usage of the + // dense descriptor + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::BLAS2_VXM_GENERIC, + size( u ), sizeof( IOType ), dense_descr, true, + &u, nullptr, &internal::getCoordinates( u ), nullptr, + &v, + masked ? &mask : nullptr, + input_masked ? &v_mask : nullptr, + nullptr, + &internal::getCoordinates( v ), + masked ? &internal::getCoordinates( mask ) : nullptr, + input_masked ? 
&internal::getCoordinates( v_mask ) : nullptr, + nullptr, + &A + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: vxm_generic" << std::endl; +#endif + +#ifdef _DEBUG + std::cout << s << ": exiting SpMV / SpMSpV.\n" << std::flush; +#endif + return ret; + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return vxm< descr, true, false >( u, mask, v, empty_mask, A, ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + const grb::Vector< bool, ascend, Coords > empty_mask( 0 ); + return vxm< descr, true, false >( u, mask, v, empty_mask, A, add, mul, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool output_may_be_masked = true, + bool input_may_be_masked = true, + class Ring, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + constexpr bool left_sided = true; + if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) { + + return internal::vxm_generic< + descr, true, false, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( input_may_be_masked && size( mask ) == 0 && size( v_mask ) > 0 ) { + return internal::vxm_generic< 
+ descr, false, true, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 && size( v_mask ) > 0 ) { + return internal::vxm_generic< + descr, true, true, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else { + assert( size( mask ) == 0 ); + assert( size( v_mask ) == 0 ); + return internal::vxm_generic< + descr, false, false, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename Coords, + typename RIT, + typename CIT, + typename NIT, + typename IOType = typename Ring::D4, + typename InputType1 = typename Ring::D1, + typename InputType2 = typename Ring::D2 + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType1, ascend, Coords > &v, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Ring &ring = Ring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return vxm< descr, false, false >( u, empty_mask, v, empty_mask, A, ring, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType1, ascend, Coords > &v, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return vxm< descr, false, false >( u, empty_mask, v, empty_mask, A, add, mul, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename Coords, + typename RIT, + typename CIT, + typename NIT, + typename IOType = typename Ring::D4, + typename InputType1 = typename Ring::D1, + typename InputType2 = typename Ring::D2, + typename InputType3 = bool + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const Ring &ring, + 
const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, void + >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return mxv< descr, true, false >( u, mask, A, v, empty_mask, ring, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool output_may_be_masked = true, + bool input_may_be_masked = true, + class Ring, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Ring &ring, + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + constexpr Descriptor new_descr = descr ^ descriptors::transpose_matrix; + constexpr bool left_sided = false; + if( output_may_be_masked && ( size( v_mask ) == 0 && size( mask ) > 0 ) ) { + + return internal::vxm_generic< + new_descr, true, false, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( input_may_be_masked && ( size( mask ) == 0 && + size( v_mask ) > 0 ) + ) { + return internal::vxm_generic< + new_descr, false, true, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 && + size( v_mask ) > 0 + ) { + return internal::vxm_generic< + new_descr, true, true, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else { + assert( size( mask ) == 0 ); + assert( size( v_mask ) == 0 ); + return internal::vxm_generic< + new_descr, false, false, left_sided, true, Ring::template One + >( + u, mask, v, v_mask, A, + ring.getAdditiveMonoid(), ring.getMultiplicativeOperator(), + phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + class Ring, + typename Coords, + typename RIT, + typename CIT, + typename NIT, + typename IOType = typename Ring::D4, + typename InputType1 = typename Ring::D1, + typename InputType2 = typename Ring::D2 + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const Ring &ring, + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_semiring< Ring >::value, + void >::type * const = nullptr + ) { + const 
Vector< bool, ascend, Coords > empty_mask( 0 ); + return mxv< descr, false, false >( u, empty_mask, A, v, empty_mask, ring, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + const Vector< bool, ascend, Coords > empty_mask( 0 ); + return mxv< descr, false, false >( u, empty_mask, A, v, empty_mask, add, mul, + phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + bool output_may_be_masked = true, + bool input_may_be_masked = true, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC vxm( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + !grb::is_object< InputType4 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + static_assert( !(descr & descriptors::add_identity), "Cannot add an " + "identity if no concept of `one' is known. Suggested fix: use a semiring " + "instead." 
); + constexpr bool left_sided = true; + if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) { + return internal::vxm_generic< + descr, true, false, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( input_may_be_masked && size( v_mask ) > 0 && size( mask ) == 0 ) { + return internal::vxm_generic< + descr, false, true, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 && + size( v_mask ) > 0 + ) { + return internal::vxm_generic< + descr, true, true, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else { + assert( size( mask ) == 0 ); + assert( size( v_mask ) == 0 ); + return internal::vxm_generic< + descr, false, false, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + bool output_may_be_masked = true, + bool input_may_be_masked = true, + class AdditiveMonoid, + class MultiplicativeOperator, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename RIT, + typename CIT, + typename NIT, + typename Coords + > + RC mxv( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const AdditiveMonoid &add = AdditiveMonoid(), + const MultiplicativeOperator &mul = MultiplicativeOperator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_monoid< AdditiveMonoid >::value && + grb::is_operator< MultiplicativeOperator >::value && + !grb::is_object< IOType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< InputType3 >::value && + !grb::is_object< InputType4 >::value && + !std::is_same< InputType2, void >::value, + void >::type * const = nullptr + ) { + static_assert( !(descr & descriptors::add_identity), "Cannot add an identity " + "if no concept of `1' is known. Suggested fix: use a semiring " + "instead." 
); + constexpr Descriptor new_descr = descr ^ descriptors::transpose_matrix; + constexpr bool left_sided = false; + if( output_may_be_masked && size( v_mask ) == 0 && size( mask ) > 0 ) { + return internal::vxm_generic< + new_descr, true, false, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( input_may_be_masked && size( mask ) == 0 && + size( v_mask ) > 0 + ) { + return internal::vxm_generic< + new_descr, false, true, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else if( output_may_be_masked && input_may_be_masked && size( mask ) > 0 && + size( v_mask ) > 0 + ) { + return internal::vxm_generic< + new_descr, true, true, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } else { + assert( size( mask ) == 0 ); + assert( size( v_mask ) == 0 ); + return internal::vxm_generic< + new_descr, false, false, left_sided, false, AdditiveMonoid::template Identity + >( + u, mask, v, v_mask, A, add, mul, phase, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + }, + []( const size_t i ) { + return i; + } + ); + } + } + + template< + class ActiveDistribution, + typename Func, + typename DataType, + typename RIT, + typename CIT, + typename NIT + > + RC eWiseLambda( + const Func f, + const Matrix< DataType, ascend, RIT, CIT, NIT > &A, + const size_t s, + const size_t P + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); + + // second, delegate to the reference backend + return eWiseLambda< ActiveDistribution, Func, DataType, RIT, CIT, NIT >( + f, internal::getRefMatrix( A ), s, P ); + } + + template< + typename Func, + typename DataType1, + typename RIT, + typename CIT, + typename NIT, + typename DataType2, + typename Coords, + typename... Args + > + RC eWiseLambda( + const Func f, + const Matrix< DataType1, ascend, RIT, CIT, NIT > &A, + const Vector< DataType2, ascend, Coords > &x, + Args... args + ) { + // do size checking + if( !( size( x ) == nrows( A ) || size( x ) == ncols( A ) ) ) { + std::cerr << "Mismatching dimensions: given vector of size " << size( x ) + << " has nothing to do with either matrix dimension (" << nrows( A ) + << " nor " << ncols( A ) << ").\n"; + return MISMATCH; + } + + return eWiseLambda( f, A, args... ); + } + + /** @} */ + +} // namespace grb + +#undef NO_CAST_ASSERT + +#endif // end _H_GRB_ASCEND_BLAS2 + diff --git a/include/graphblas/ascend/blas3.hpp b/include/graphblas/ascend/blas3.hpp new file mode 100644 index 000000000..d910caea6 --- /dev/null +++ b/include/graphblas/ascend/blas3.hpp @@ -0,0 +1,534 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Implements the level-3 primitives for the Ascend backend + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BLAS3 +#define _H_GRB_ASCEND_BLAS3 + +#include //for std::enable_if + +#include +#include + +#include "io.hpp" +#include "matrix.hpp" + +#include + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | For all mismatches in the domains of input " \ + "parameters and the semiring domains, as specified in the " \ + "documentation of the function " y ", supply a container argument of " \ + "the expected type instead.\n" \ + "* Possible fix 3 | Provide a compatible semiring where all domains " \ + "match those of the container arguments, as specified in the " \ + "documentation of the function " y ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + +} + +namespace grb { + + namespace internal { + + template< + bool allow_void, + Descriptor descr, + class Monoid, + class Operator, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC mxm_generic( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Operator &oper, + const Monoid &monoid, + const MulMonoid &mulMonoid, + const Phase &phase, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + le.execution(); + + // second, delegate to the reference backend + return mxm_generic< allow_void, descr >( + getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ), + oper, monoid, mulMonoid, phase + ); + } + + } // end namespace grb::internal + + template< + Descriptor descr = descriptors::no_operation, + class Semiring, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + 
typename RIT3, typename CIT3, typename NIT3 + > + RC mxm( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Semiring &ring = Semiring(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_semiring< Semiring >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Semiring::D1, InputType1 >::value + ), "grb::mxm", + "called with a prefactor input matrix A that does not match the first " + "domain of the given operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Semiring::D2, InputType2 >::value ), "grb::mxm", + "called with a postfactor input matrix B that does not match the " + "second domain of the given operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Semiring::D4, OutputType >::value + ), "grb::mxm", + "called with an output matrix C that does not match the output domain " + "of the given operator" ); + +#ifdef _DEBUG + std::cout << "In grb::mxm (ascend, unmasked, semiring)\n"; +#endif + + return internal::mxm_generic< true, descr >( + C, A, B, + ring.getMultiplicativeOperator(), + ring.getAdditiveMonoid(), + ring.getMultiplicativeMonoid(), + phase + ); + } + + template< + Descriptor descr = grb::descriptors::no_operation, + class Operator, + class Monoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC mxm( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Monoid &addM, + const Operator &mulOp, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value && + grb::is_monoid< Monoid >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Operator::D1, InputType1 >::value + ), "grb::mxm", + "called with a prefactor input matrix A that does not match the first " + "domain of the given multiplication operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Operator::D2, InputType2 >::value + ), "grb::mxm", + "called with a postfactor input matrix B that does not match the first " + "domain of the given multiplication operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Operator::D3, OutputType >::value ), + "grb::mxm", + "called with an output matrix C that does not match the output domain " + "of the given multiplication operator" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D1, typename Operator::D3 >::value + ), "grb::mxm", + "the output domain of the multiplication operator does not match the " + "first domain of the given addition monoid" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D2, OutputType >::value 
+ ), "grb::mxm", + "the second domain of the given addition monoid does not match the " + "type of the output matrix C" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< typename Monoid::D3, OutputType >::value + ), "grb::mxm", + "the output type of the given addition monoid does not match the type " + "of the output matrix C" ); + static_assert( ( !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value + ) ), + "grb::mxm: the operator-monoid version of mxm cannot be used if either " + "of the input matrices is a pattern matrix (of type void)" ); + + return internal::mxm_generic< false, descr >( + C, A, B, mulOp, addM, Monoid(), phase + ); + } + + namespace internal { + + template< + Descriptor descr = descriptors::no_operation, + bool matrix_is_void, + typename OutputType, + typename InputType1, typename InputType2, typename InputType3, + typename RIT, typename CIT, typename NIT, + typename Coords + > + RC matrix_zip_generic( + Matrix< OutputType, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Vector< InputType3, ascend, Coords > &z, + const Phase &phase + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + le.execution(); + + // second, delegate to the reference backend + return matrix_zip_generic< descr, matrix_is_void >( + getRefMatrix( A ), getRefVector( x ), getRefVector( y ), getRefVector( z ), + phase + ); + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, + typename InputType1, typename InputType2, typename InputType3, + typename RIT, typename CIT, typename NIT, + typename Coords + > + RC zip( + Matrix< OutputType, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Vector< InputType3, ascend, Coords > &z, + const Phase &phase = EXECUTE + ) { + static_assert( !(descr & descriptors::no_casting) || + std::is_integral< InputType1 >::value, + "grb::zip (two vectors to matrix) called " + "using non-integral left-hand vector elements" ); + static_assert( !(descr & descriptors::no_casting) || + std::is_integral< InputType2 >::value, + "grb::zip (two vectors to matrix) called " + "using non-integral right-hand vector elements" ); + static_assert( !(descr & descriptors::no_casting) || + std::is_same< OutputType, InputType3 >::value, + "grb::zip (two vectors to matrix) called " + "with differing vector nonzero and output matrix domains" ); + + const size_t n = grb::size( x ); + const size_t nz = grb::nnz( x ); + const RC ret = grb::clear( A ); + if( ret != SUCCESS ) { + return ret; + } + if( n != grb::size( y ) ) { + return MISMATCH; + } + if( n != grb::size( z ) ) { + return MISMATCH; + } + if( nz != grb::nnz( y ) ) { + return ILLEGAL; + } + if( nz != grb::nnz( z ) ) { + return ILLEGAL; + } + + return internal::matrix_zip_generic< descr, false >( A, x, y, z, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename InputType1, typename InputType2, + typename RIT, typename CIT, typename NIT, + typename Coords + > + RC zip( + Matrix< void, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const Phase &phase = EXECUTE + ) { + static_assert( !(descr & descriptors::no_casting) || + std::is_integral< InputType1 >::value, + "grb::zip (two vectors to 
void matrix) called using non-integral " + "left-hand vector elements" ); + static_assert( !(descr & descriptors::no_casting) || + std::is_integral< InputType2 >::value, + "grb::zip (two vectors to void matrix) called using non-integral " + "right-hand vector elements" ); + + const size_t n = grb::size( x ); + const size_t nz = grb::nnz( x ); + const RC ret = grb::clear( A ); + if( ret != SUCCESS ) { + return ret; + } + if( n != grb::size( y ) ) { + return MISMATCH; + } + if( nz != grb::nnz( y ) ) { + return ILLEGAL; + } + + return internal::matrix_zip_generic< descr, true >( A, x, y, x, phase ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Operator, + typename InputType1, typename InputType2, typename OutputType, + typename Coords, + typename RIT, typename CIT, typename NIT + > + RC outer( + Matrix< OutputType, ascend, RIT, CIT, NIT > &A, + const Vector< InputType1, ascend, Coords > &u, + const Vector< InputType2, ascend, Coords > &v, + const Operator &mul = Operator(), + const Phase &phase = EXECUTE, + const typename std::enable_if< + grb::is_operator< Operator >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + !grb::is_object< OutputType >::value, + void >::type * const = nullptr + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + internal::le.execution(); + + // second, delegate to the reference backend + return outer< descr, Operator >( + internal::getRefMatrix( A ), + internal::getRefVector( u ), internal::getRefVector( v ), + mul, phase + ); + } + + namespace internal { + + template< + bool allow_void, + Descriptor descr, + class MulMonoid, class Operator, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply_matrix_generic( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Operator &oper, + const MulMonoid &mulMonoid, + const Phase &phase, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void >::type * const = nullptr + ) { + // ascend execution is not supported + // first, execute any computation that is not completed + le.execution(); + + // second, delegate to the reference backend + return eWiseApply_matrix_generic< + allow_void, descr, + MulMonoid, Operator + >( + getRefMatrix( C ), getRefMatrix( A ), getRefMatrix( B ), + oper, mulMonoid, phase + ); + } + + } // namespace internal + + template< + Descriptor descr = descriptors::no_operation, + class MulMonoid, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const MulMonoid &mulmono, + const Phase phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_monoid< MulMonoid >::value, + void 
>::type * const = nullptr + ) { + // static checks + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename MulMonoid::D1, InputType1 >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, monoid)", + "called with a prefactor input matrix A that does not match the first " + "domain of the monoid operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename MulMonoid::D2, InputType2 >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, monoid)", + "called with a postfactor input matrix B that does not match the " + "second domain of the monoid operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename MulMonoid::D3, OutputType >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, monoid)", + "called with an output matrix C that does not match the output domain " + "of the monoid operator" + ); + +#ifdef _DEBUG + std::cout << "In grb::eWiseApply_matrix_generic (ascend, monoid)\n"; +#endif + + return internal::eWiseApply_matrix_generic< true, descr >( + C, A, B, mulmono.getOperator(), mulmono, phase + ); + } + + template< + Descriptor descr = grb::descriptors::no_operation, + class Operator, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2, + typename RIT3, typename CIT3, typename NIT3 + > + RC eWiseApply( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const Matrix< InputType2, ascend, RIT3, CIT3, NIT3 > &B, + const Operator &mulOp, + const Phase phase = EXECUTE, + const typename std::enable_if< !grb::is_object< OutputType >::value && + !grb::is_object< InputType1 >::value && + !grb::is_object< InputType2 >::value && + grb::is_operator< Operator >::value, + void >::type * const = nullptr + ) { + // static checks + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D1, InputType1 >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, operator)", + "called with a prefactor input matrix A that does not match the first " + "domain of the given multiplication operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D2, InputType2 >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, operator)", + "called with a postfactor input matrix B that does not match the second " + "domain of the given multiplication operator" + ); + NO_CAST_ASSERT( ( !( descr & descriptors::no_casting ) || + std::is_same< typename Operator::D3, OutputType >::value ), + "grb::eWiseApply (ascend, matrix <- matrix x matrix, operator)", + "called with an output matrix C that does not match the output domain " + "of the given multiplication operator" + ); + static_assert( ( !( + std::is_same< InputType1, void >::value || + std::is_same< InputType2, void >::value ) + ), "grb::eWiseApply (ascend, matrix <- matrix x matrix, operator): " + "the operator version of eWiseApply cannot be used if either of the " + "input matrices is a pattern matrix (of type void)" + ); + + typename grb::Monoid< + grb::operators::mul< double >, + grb::identities::one + > dummyMonoid; + return internal::eWiseApply_matrix_generic< false, descr >( + C, A, B, mulOp, dummyMonoid, phase + ); + } + +} // namespace grb + +#undef NO_CAST_ASSERT + +#endif // ``_H_GRB_ASCEND_BLAS3''
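
The level-3 primitives added above (grb::mxm, grb::zip, grb::outer, and grb::eWiseApply) all follow the same pattern: native ascend execution is not supported, so each call first flushes the lazy-evaluation pipeline via internal::le.execution() and then delegates to the reference backend through internal::getRefMatrix / internal::getRefVector. A minimal caller-side sketch of this path follows; it is not part of this patch and assumes the public ALP/GraphBLAS header, the usual grb::Launcher entry point, and a build with the ascend backend selected (for instance through the generated backend compiler wrapper). The names grbProgram and plusTimes are illustrative only.

#include <graphblas.hpp>

// Illustrative ALP program: multiplies two (here still empty) 3-by-3 matrices
// under the conventional plus-times semiring over doubles. When compiled for
// the ascend backend, grb::mxm flushes the pipeline and falls through to the
// reference implementation shown above.
void grbProgram( const int &, grb::RC &rc ) {
	grb::Matrix< double > C( 3, 3 ), A( 3, 3 ), B( 3, 3 );
	const grb::Semiring<
		grb::operators::add< double >, grb::operators::mul< double >,
		grb::identities::zero, grb::identities::one
	> plusTimes;
	rc = grb::mxm( C, A, B, plusTimes );
}

int main() {
	grb::Launcher< grb::AUTOMATIC > launcher;
	grb::RC out = grb::PANIC;
	const int in = 0;
	if( launcher.exec( &grbProgram, in, out, true ) != grb::SUCCESS ) {
		return 255;
	}
	return out == grb::SUCCESS ? 0 : 1;
}

diff --git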
a/include/graphblas/ascend/boolean_dispatcher_blas1.hpp b/include/graphblas/ascend/boolean_dispatcher_blas1.hpp new file mode 100644 index 000000000..d093db955 --- /dev/null +++ b/include/graphblas/ascend/boolean_dispatcher_blas1.hpp @@ -0,0 +1,1738 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#ifndef _H_GRB_ASCEND_BOOLEAN_DISPATCHER_BLAS1 +#define _H_GRB_ASCEND_BOOLEAN_DISPATCHER_BLAS1 + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" +#include "vector_wrapper.hpp" + + +namespace grb { + + namespace internal { + + template< + Descriptor descr, + bool masked, + bool left, + bool already_dense_input_to_fold, + bool already_dense_mask, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_vectorDriven( + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ); + + template< + Descriptor descr, + bool masked, + bool left, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC boolean_dispatcher_fold_from_vector_to_scalar_vectorDriven( + const bool already_dense_input_to_fold, + const bool already_dense_mask, + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_vectorDriven< + descr, masked, left, true, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_vectorDriven< + descr, masked, left, true, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_vectorDriven< + descr, masked, left, false, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_vectorDriven< + descr, masked, left, false, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } + } + + template< + Descriptor descr, + bool left, + bool already_dense_input_to_fold, + bool already_dense_mask, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_maskDriven( + typename Monoid::D3 &thread_local_output, + 
const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ); + + template< + Descriptor descr, + bool left, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC boolean_dispatcher_fold_from_vector_to_scalar_maskDriven( + const bool already_dense_input_to_fold, + const bool already_dense_mask, + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_maskDriven< + descr, left, true, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_maskDriven< + descr, left, true, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_maskDriven< + descr, left, false, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_maskDriven< + descr, left, false, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } + } + + template< + Descriptor descr, + bool masked, + bool left, + bool already_dense_input_to_fold, + bool already_dense_mask, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC fold_from_vector_to_scalar_fullLoopSparse( + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ); + + template< + Descriptor descr, + bool masked, + bool left, + class Monoid, + typename InputType, + typename MaskType, + class Coords + > + RC boolean_dispatcher_fold_from_vector_to_scalar_fullLoopSparse( + const bool already_dense_input_to_fold, + const bool already_dense_mask, + typename Monoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_to_fold, + const Coords &local_mask, + const Vector< InputType, ascend, Coords > &to_fold, + const Vector< MaskType, ascend, Coords > &mask, + const Monoid &monoid + ) { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_fullLoopSparse< + descr, masked, left, true, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_fullLoopSparse< + descr, masked, left, true, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_scalar_fullLoopSparse< + descr, masked, left, false, true + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, 
local_mask, to_fold, mask, monoid + ); + } else { + return internal::fold_from_vector_to_scalar_fullLoopSparse< + descr, masked, left, false, false + >( + thread_local_output, lower_bound, upper_bound, + local_to_fold, local_mask, to_fold, mask, monoid + ); + } + } + } + + template< Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, + bool already_dense_output, + bool already_dense_mask, + typename MaskType, + typename IOType, + typename InputType, + typename Coords, + class OP + > + RC fold_from_scalar_to_vector_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_vector, + const Coords * const local_mask_ptr, + Vector< IOType, ascend, Coords > &vector, + const Vector< MaskType, ascend, Coords > * const mask, + const InputType &scalar, + const OP &op, + const Phase &phase + ); + + template< Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, + typename MaskType, + typename IOType, + typename InputType, + typename Coords, + class OP + > + RC boolean_dispatcher_fold_from_scalar_to_vector_generic( + const bool already_dense_output, + const bool already_dense_mask, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_vector, + const Coords * const local_mask_ptr, + Vector< IOType, ascend, Coords > &vector, + const Vector< MaskType, ascend, Coords > * const mask, + const InputType &scalar, + const OP &op, + const Phase &phase + ) { + if( already_dense_output ) { + if( already_dense_mask ) { + return internal::fold_from_scalar_to_vector_generic< + descr, left, sparse, masked, monoid, + true, true + >( + lower_bound, upper_bound, local_vector, local_mask_ptr, + vector, mask, scalar, op, phase + ); + } else { + return internal::fold_from_scalar_to_vector_generic< + descr, left, sparse, masked, monoid, + true, false + >( + lower_bound, upper_bound, local_vector, local_mask_ptr, + vector, mask, scalar, op, phase + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_scalar_to_vector_generic< + descr, left, sparse, masked, monoid, + false, true + >( + lower_bound, upper_bound, local_vector, local_mask_ptr, + vector, mask, scalar, op, phase + ); + } else { + return internal::fold_from_scalar_to_vector_generic< + descr, left, sparse, masked, monoid, + false, false + >( + lower_bound, upper_bound, local_vector, local_mask_ptr, + vector, mask, scalar, op, phase + ); + } + } + } + + template< Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, + bool already_dense_output, + bool already_dense_input_to_fold, + bool already_dense_mask, + typename MaskType, + typename IOType, + typename IType, + typename Coords, + class OP + > + RC fold_from_vector_to_vector_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_fold_into, + const Coords * const local_m_ptr, + const Coords &local_to_fold, + Vector< IOType, ascend, Coords > &fold_into, + const Vector< MaskType, ascend, Coords > * const m, + const Vector< IType, ascend, Coords > &to_fold, + const OP &op, + const Phase phase + ); + + template< Descriptor descr, + bool left, + bool sparse, + bool masked, + bool monoid, + typename MaskType, + typename IOType, + typename IType, + typename Coords, + class OP + > + RC boolean_dispatcher_fold_from_vector_to_vector_generic( + const bool already_dense_output, + const bool already_dense_input_to_fold, + const bool already_dense_mask, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_fold_into, + const Coords * const local_m_ptr, + 
const Coords &local_to_fold, + Vector< IOType, ascend, Coords > &fold_into, + const Vector< MaskType, ascend, Coords > * const m, + const Vector< IType, ascend, Coords > &to_fold, + const OP &op, + const Phase phase + ) { + if( already_dense_output ) { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + true, true, true + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } else { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + true, true, false + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + true, false, true + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } else { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + true, false, false + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } + } + } else { + if( already_dense_input_to_fold ) { + if( already_dense_mask ) { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + false, true, true + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } else { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + false, true, false + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } + } else { + if( already_dense_mask ) { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + false, false, true + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } else { + return internal::fold_from_vector_to_vector_generic< + descr, left, sparse, masked, monoid, + false, false, false + >( + lower_bound, upper_bound, local_fold_into, local_m_ptr, + local_to_fold, fold_into, m, to_fold, op, phase + ); + } + } + } + } + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC dense_apply_generic( + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op + ); + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, + typename OutputType, + typename InputType1, + typename InputType2, + typename Coords + > + RC boolean_dispatcher_dense_apply_generic( + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const 
internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op + ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::dense_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, true + >( + lower_bound, upper_bound, + local_x, local_y, z_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::dense_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, false + >( + lower_bound, upper_bound, + local_x, local_y, z_vector, x_wrapper, y_wrapper, op + ); + } + } else { + if( already_dense_input_y ) { + return internal::dense_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, true + >( + lower_bound, upper_bound, + local_x, local_y, z_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::dense_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, false + >( + lower_bound, upper_bound, + local_x, local_y, z_vector, x_wrapper, y_wrapper, op + ); + } + } + } + + template< + bool masked, + bool monoid, + bool x_scalar, + bool y_scalar, + Descriptor descr, + class OP, + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC sparse_apply_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_mask_ptr, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const mask_vector, + const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper, + const OP &op + ); + + template< + bool masked, + bool monoid, + bool x_scalar, + bool y_scalar, + Descriptor descr, + class OP, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC boolean_dispatcher_sparse_apply_generic( + const bool already_dense_mask, + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_mask_ptr, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const mask_vector, + const internal::Wrapper< x_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< y_scalar, InputType2, Coords > y_wrapper, + const OP &op + ) { + if( already_dense_mask ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + true, true, true + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + true, true, false + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + true, false, true + > ( + lower_bound, upper_bound, + local_z, local_mask_ptr, local_x,
local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + true, false, false + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + false, true, true + > ( + lower_bound, upper_bound, + local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + false, true, false + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + false, false, true + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } else { + return internal::sparse_apply_generic< + masked, monoid, x_scalar, y_scalar, descr, OP, + false, false, false + > ( + lower_bound, upper_bound, local_z, local_mask_ptr, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op + ); + } + } + } + } + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, + bool already_dense_mask, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC masked_apply_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_mask, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &mask_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op, +#ifdef GRB_BOOLEAN_DISPATCHER + const InputType1 * const left_identity, + const InputType2 * const right_identity +#else + const InputType1 * const left_identity = nullptr, + const InputType2 * const right_identity = nullptr +#endif + ); + + template< + bool left_scalar, + bool right_scalar, + bool left_sparse, + bool right_sparse, + Descriptor descr, + class OP, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename Coords + > + RC boolean_dispatcher_masked_apply_generic( + const bool already_dense_mask, + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_mask, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &mask_vector, + const internal::Wrapper< left_scalar, InputType1, Coords > x_wrapper, + const internal::Wrapper< right_scalar, InputType2, Coords > y_wrapper, + const OP &op, + const InputType1 * const left_identity = nullptr, + const InputType2 * const right_identity = nullptr + ) { + if( already_dense_mask ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::masked_apply_generic< + left_scalar, 
right_scalar, left_sparse, right_sparse, descr, OP, + true, true, true + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, op, left_identity, right_identity + ); + } else { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, true, false + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } + } else { + if( already_dense_input_y ) { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, false, true + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } else { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + true, false, false + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, true, true + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } else { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, true, false + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } + } else { + if( already_dense_input_y ) { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, false, true + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } else { + return internal::masked_apply_generic< + left_scalar, right_scalar, left_sparse, right_sparse, descr, OP, + false, false, false + >( + lower_bound, upper_bound, local_z, local_mask, local_x, local_y, + z_vector, mask_vector, x_wrapper, y_wrapper, + op, left_identity, right_identity + ); + } + } + } + } + + template< + Descriptor descr, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC sparse_eWiseMulAdd_maskDriven( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &m_vector, + const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ); + + template< + Descriptor descr, + bool a_scalar, + bool x_scalar, + bool y_scalar, + bool y_zero, + typename OutputType, 
+ typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC boolean_dispatcher_sparse_eWiseMulAdd_maskDriven( + const bool already_dense_output, + const bool already_dense_mask, + const bool already_dense_input_a, + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords &local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > &m_vector, + const internal::Wrapper< a_scalar, InputType1, Coords > &a_wrapper, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ) { + if( already_dense_output ) { + if( already_dense_mask ) { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, true, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } + } else { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + 
return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + true, false, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } + } + } else { + if( already_dense_mask ) { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, 
true, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, false, false, true + >( lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, true, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } + } else { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, false, false, true + >( + lower_bound, upper_bound, 
local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } else { + return internal::sparse_eWiseMulAdd_maskDriven< + descr, a_scalar, x_scalar, y_scalar, y_zero, + false, false, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_wrapper, x_wrapper, y_wrapper, ring + ); + } + } + } + } + } + } + + template< + Descriptor descr, + bool masked, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool mulSwitched, + bool already_dense_output, + bool already_dense_mask, + bool already_dense_input_a, + bool already_dense_input_x, + bool already_dense_input_y, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC twoPhase_sparse_eWiseMulAdd_mulDriven( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const m_vector, + const Vector< InputType1, ascend, Coords > &a_vector, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring + ); + + template< + Descriptor descr, + bool masked, + bool x_scalar, + bool y_scalar, + bool y_zero, + bool mulSwitched, + typename OutputType, + typename MaskType, + typename InputType1, + typename InputType2, + typename InputType3, + typename Coords, + class Ring + > + RC boolean_dispatcher_twoPhase_sparse_eWiseMulAdd_mulDriven( + const bool already_dense_output, + const bool already_dense_mask, + const bool already_dense_input_a, + const bool already_dense_input_x, + const bool already_dense_input_y, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_z, + const Coords * const local_m, + const Coords &local_a, + const Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &z_vector, + const Vector< MaskType, ascend, Coords > * const m_vector, + const Vector< InputType1, ascend, Coords > &a_vector, + const internal::Wrapper< x_scalar, InputType2, Coords > &x_wrapper, + const internal::Wrapper< y_scalar, InputType3, Coords > &y_wrapper, + const Ring &ring = Ring() + ) { + if( already_dense_output ) { + if( already_dense_mask ) { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return 
internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, true, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } + } else { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, 
local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + true, false, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } + } + } else { + if( already_dense_mask ) { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, true, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } + } else { + if( already_dense_input_a ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return 
internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, true, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, true, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, true, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, true, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } else { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, false, true, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, false, true, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } else { + if( already_dense_input_y ) { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, false, false, true + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } else { + return internal::twoPhase_sparse_eWiseMulAdd_mulDriven< + descr, masked, x_scalar, y_scalar, y_zero, mulSwitched, + false, false, false, false, false + >( + lower_bound, upper_bound, local_z, local_m, local_a, local_x, local_y, + z_vector, m_vector, a_vector, x_wrapper, y_wrapper, ring + ); + } + } + } + } + } + } + + template< + Descriptor descr, + bool already_dense_input_x, + bool already_dense_input_y, + class AddMonoid, + class AnyOp, + typename InputType1, + typename InputType2, + typename Coords + > + RC sparse_dot_generic( + typename AddMonoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + const Vector< InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const size_t local_nz, + const AddMonoid &addMonoid, + const AnyOp &anyOp + ); + + template< + Descriptor descr, + class AddMonoid, + class AnyOp, + typename InputType1, + typename InputType2, + typename Coords + > + RC boolean_dispatcher_sparse_dot_generic( + const bool already_dense_input_x, + const bool already_dense_input_y, + typename AddMonoid::D3 &thread_local_output, + const size_t lower_bound, + const size_t upper_bound, + const Coords &local_x, + const Coords &local_y, + const Vector< 
InputType1, ascend, Coords > &x, + const Vector< InputType2, ascend, Coords > &y, + const size_t local_nz, + const AddMonoid &addMonoid, + const AnyOp &anyOp + ) { + if( already_dense_input_x ) { + if( already_dense_input_y ) { + return internal::sparse_dot_generic< + descr, true, true + >( + thread_local_output, lower_bound, upper_bound, local_x, local_y, + x, y, local_nz, addMonoid, anyOp + ); + } else { + return internal::sparse_dot_generic< + descr, true, false + >( + thread_local_output, lower_bound, upper_bound, local_x, local_y, + x, y, local_nz, addMonoid, anyOp + ); + } + } else { + if( already_dense_input_y ) { + return internal::sparse_dot_generic< + descr, false, true + >( + thread_local_output, lower_bound, upper_bound, local_x, local_y, + x, y, local_nz, addMonoid, anyOp + ); + } else { + return internal::sparse_dot_generic< + descr, false, false + >( + thread_local_output, lower_bound, upper_bound, local_x, local_y, + x, y, local_nz, addMonoid, anyOp + ); + } + } + } + + } // end namespace ``internal'' + +} // end namespace ``grb'' + +#endif + diff --git a/include/graphblas/ascend/boolean_dispatcher_blas2.hpp b/include/graphblas/ascend/boolean_dispatcher_blas2.hpp new file mode 100644 index 000000000..8a846672c --- /dev/null +++ b/include/graphblas/ascend/boolean_dispatcher_blas2.hpp @@ -0,0 +1,191 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Dispatchers for the level-2 primitives + * + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BOOLEAN_DISPATCHER_BLAS2 +#define _H_GRB_ASCEND_BOOLEAN_DISPATCHER_BLAS2 + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" + + +namespace grb { + + namespace internal { + + template< + Descriptor descr, + bool masked, + bool input_masked, + bool left_handed, + template< typename > class One, + bool already_dense_destination_vector, + bool already_dense_mask_vector, + class AdditiveMonoid, + class Multiplication, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename Coords, + typename RowColType, + typename NonzeroType + > + inline void vxm_inner_kernel_gather( + RC &rc, + const size_t lower_bound, + Coords &local_destination_vector, + const Coords &local_mask_vector, + Vector< IOType, ascend, Coords > &destination_vector, + IOType &destination_element, + const size_t &destination_index, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_range, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const Vector< InputType4, ascend, Coords > &source_mask_vector, + const InputType4 * __restrict__ const &source_mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &src_global_to_local, + const std::function< size_t( size_t ) > &dst_local_to_global + ); + + template< + Descriptor descr, + bool masked, + bool input_masked, + bool left_handed, + template< typename > class One, + class AdditiveMonoid, + class Multiplication, + typename IOType, + typename InputType1, + typename InputType2, + typename InputType3, + typename InputType4, + typename Coords, + typename RowColType, + typename NonzeroType + > + inline void boolean_dispatcher_vxm_inner_kernel_gather( + const bool already_dense_destination_vector, + const bool already_dense_mask_vector, + RC &rc, + const size_t lower_bound, + Coords &local_destination_vector, + const Coords &local_mask_vector, + Vector< IOType, ascend, Coords > &destination_vector, + IOType &destination_element, + const size_t &destination_index, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_range, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const Vector< InputType4, ascend, Coords > &source_mask_vector, + const InputType4 * __restrict__ const &source_mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &src_global_to_local, + const std::function< size_t( size_t ) > &dst_local_to_global + ) { + if( already_dense_destination_vector ) { + if( already_dense_mask_vector ) { + return internal::vxm_inner_kernel_gather< + descr, masked, input_masked, left_handed, One, + true, true + >( + rc, lower_bound, local_destination_vector, local_mask_vector, + destination_vector, destination_element, destination_index, + source_vector, source, source_range, matrix, mask_vector, mask, 
+ source_mask_vector, source_mask, add, mul, + src_local_to_global, src_global_to_local, dst_local_to_global + ); + } else { + return internal::vxm_inner_kernel_gather< + descr, masked, input_masked, left_handed, One, + true, false + >( + rc, lower_bound, local_destination_vector, local_mask_vector, + destination_vector, destination_element, destination_index, + source_vector, source, source_range, matrix, mask_vector, mask, + source_mask_vector, source_mask, add, mul, + src_local_to_global, src_global_to_local, dst_local_to_global + ); + } + } else { + if( already_dense_mask_vector ) { + return internal::vxm_inner_kernel_gather< + descr, masked, input_masked, left_handed, One, + false, true + >( + rc, lower_bound, local_destination_vector, local_mask_vector, + destination_vector, destination_element, destination_index, + source_vector, source, source_range, matrix, mask_vector, mask, + source_mask_vector, source_mask, add, mul, + src_local_to_global, src_global_to_local, dst_local_to_global + ); + } else { + return internal::vxm_inner_kernel_gather< + descr, masked, input_masked, left_handed, One, + false, false + >( + rc, lower_bound, local_destination_vector, local_mask_vector, + destination_vector, destination_element, destination_index, + source_vector, source, source_range, matrix, mask_vector, mask, + source_mask_vector, source_mask, add, mul, + src_local_to_global, src_global_to_local, dst_local_to_global + ); + } + } + } + + } // end namespace ``internal'' + +} // end namespace ``grb'' + +#endif + diff --git a/include/graphblas/ascend/boolean_dispatcher_io.hpp b/include/graphblas/ascend/boolean_dispatcher_io.hpp new file mode 100644 index 000000000..d298cb38a --- /dev/null +++ b/include/graphblas/ascend/boolean_dispatcher_io.hpp @@ -0,0 +1,362 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Dispatchers for the ascend I/O primitives. + * + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_BOOLEAN_DISPATCHER_IO +#define _H_GRB_ASCEND_BOOLEAN_DISPATCHER_IO + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" + + +namespace grb { + + namespace internal { + + template< + Descriptor descr, + bool loop_over_vector_length, + bool already_dense_mask, + bool mask_is_dense, + typename DataType, + typename MaskType, + typename T, + typename Coords + > + RC masked_set( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + Vector< DataType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const T val + ); + + template< + Descriptor descr, + typename DataType, + typename MaskType, + typename T, + typename Coords + > + RC boolean_dispatcher_masked_set( + const bool loop_over_vector_length, + const bool already_dense_mask, + const bool mask_is_dense, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + Vector< DataType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const T val + ) { + if( loop_over_vector_length ) { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, true, true, true + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } else { + return internal::masked_set< + descr, true, true, false + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, true, false, true + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } else { + return internal::masked_set< + descr, true, false, false + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } + } + } else { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, false, true, true + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } else { + return internal::masked_set< + descr, false, true, false + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, false, false, true + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } else { + return internal::masked_set< + descr, false, false, false + >( lower_bound, upper_bound, local_x, local_mask, x, m, val ); + } + } + } + } + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, + bool sparse, + bool already_dense_vectors, + bool already_dense_input, + typename OutputType, + typename InputType, + typename Coords + > + RC set_generic( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y + ); + + template< Descriptor descr, + bool out_is_void, + bool in_is_void, + bool sparse, + typename OutputType, + typename InputType, + typename Coords + > + RC boolean_dispatcher_set_generic( + const bool already_dense_vectors, + const bool already_dense_input, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y + ) { + if( already_dense_vectors ) { + if( already_dense_input ) { + return internal::set_generic< + descr, out_is_void, in_is_void, sparse, + true, true + >( lower_bound, upper_bound, 
local_x, local_y, x, y ); + } else { + return internal::set_generic< + descr, out_is_void, in_is_void, sparse, + true, false + >( lower_bound, upper_bound, local_x, local_y, x, y ); + } + } else { + if( already_dense_input ) { + return internal::set_generic< + descr, out_is_void, in_is_void, sparse, + false, true + >( lower_bound, upper_bound, local_x, local_y, x, y ); + } else { + return internal::set_generic< + descr, out_is_void, in_is_void, sparse, + false, false + >( lower_bound, upper_bound, local_x, local_y, x, y ); + } + } + } + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, + bool loop_over_y, + bool already_dense_input_y, + bool already_dense_mask, + bool mask_is_dense, + typename OutputType, + typename MaskType, + typename InputType, + typename Coords + > + RC masked_set( + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType, ascend, Coords > &y + ); + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, + typename OutputType, + typename MaskType, + typename InputType, + typename Coords + > + RC boolean_dispatcher_masked_set( + const bool loop_over_y, + const bool already_dense_input_y, + const bool already_dense_mask, + const bool mask_is_dense, + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType, ascend, Coords > &y + ) { + if( loop_over_y ) { + if( already_dense_input_y ) { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, true, true, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, true, true, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, true, false, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, true, false, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } + } else { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, false, true, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, false, true, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, false, false, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + true, false, false, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } + } + } else { + if( already_dense_input_y ) { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, true, true, true + >( lower_bound, upper_bound, 
local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, true, true, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, true, false, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, true, false, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } + } else { + if( already_dense_mask ) { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, false, true, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, false, true, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } else { + if( mask_is_dense ) { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, false, false, true + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } else { + return internal::masked_set< + descr, out_is_void, in_is_void, + false, false, false, false + >( lower_bound, upper_bound, local_x, local_mask, local_y, x, mask, y ); + } + } + } + } + } + + } // end namespace ``internal'' + +} // end namespace ``grb'' + +#endif + diff --git a/include/graphblas/ascend/collectives.hpp b/include/graphblas/ascend/collectives.hpp new file mode 100644 index 000000000..2ff096166 --- /dev/null +++ b/include/graphblas/ascend/collectives.hpp @@ -0,0 +1,91 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Collectives implementation for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_COLL +#define _H_GRB_ASCEND_COLL + +#include + +#include +#include +#include +#include + + +namespace grb { + + /** The collectives class is based on that of the reference backend */ + template<> + class collectives< ascend > { + + private: + + /** Disallow instantiation of this class. 
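+		 *
+		 * All functionality is instead offered through static member functions.
+		 * As an illustrative sketch only (assuming the usual grb::operators::add
+		 * operator; this example is a sketch, not a normative API reference), a
+		 * typical call reads:
+		 *
+		 * \code
+		 * double sum = 1.0;
+		 * grb::RC rc = grb::collectives< grb::ascend >::allreduce(
+		 * 	sum, grb::operators::add< double >()
+		 * );
+		 * // with a single user process, sum is returned unchanged
+		 * \endcode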
*/ + collectives() {} + + + public: + + template< + Descriptor descr = descriptors::no_operation, + class Operator, typename IOType + > + static RC allreduce( IOType &inout, const Operator op = Operator() ) { + return collectives< reference >::allreduce< descr, Operator, IOType >( + inout, op ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Operator, typename IOType + > + static RC reduce( + IOType &inout, const size_t root = 0, const Operator op = Operator() + ) { + return collectives< reference >::reduce< descr, Operator, IOType >( inout, + root, op ); + } + + template< typename IOType > + static RC broadcast( IOType &inout, const size_t root = 0 ) { + return collectives< reference >::broadcast< IOType >( inout, root ); + } + + template< Descriptor descr = descriptors::no_operation, typename IOType > + static RC broadcast( + IOType * inout, const size_t size, + const size_t root = 0 + ) { + return collectives< reference >::broadcast< descr, IOType >( inout, size, + root ); + } + + }; // end class `collectives< ascend >' + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_COLL'' + diff --git a/include/graphblas/ascend/config.hpp b/include/graphblas/ascend/config.hpp new file mode 100644 index 000000000..0e468af5a --- /dev/null +++ b/include/graphblas/ascend/config.hpp @@ -0,0 +1,148 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Configuration settings for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_CONFIG +#define _H_GRB_ASCEND_CONFIG + +#include +#include + + +namespace grb { + + /** + * \defgroup ascendConfig Nonblocking backend configuration + * + * \ingroup config + * + * All configuration parameters for the #grb::ascend backend. + * + * @{ + */ + + namespace config { + + /** + * The various supported Ascend boards. + */ + enum Ascend { + Ascend_910A, + Ascend_910B + }; + + /** + * Class with information about the Ascend cache/scratchpad hierarchy. + */ + //template< enum Ascend = _ASC_DEFAULT_TARGET > TODO FIXME no way to get this passed in from alpcxx / grbcxx + template< enum Ascend = Ascend_910B > // Assuming 910B default instead + class ASCEND_CACHE_HIERARCHY {}; + + /** + * Cache hierarchy parameters for the 910A. + */ + template<> + class ASCEND_CACHE_HIERARCHY< Ascend_910A > { + public: + static constexpr const size_t UB_SIZE = 8192; + }; + + /** + * Cache hierarchy parameters for the 910B. + */ + template<> + class ASCEND_CACHE_HIERARCHY< Ascend_910B > { + public: + static constexpr const size_t UB_SIZE = 8192; + }; + + /** + * Implementation-dependent configuration parameters for the \a ascend + * backend. + * + * \note The user documentation only specifies the fields that under some + * circumstances may benefit from a user adapting it. For viewing all + * fields, please see the developer documentation. 
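+	 *
+	 * As an illustration only (a sketch, not a guarantee of the configuration
+	 * interface), the compile-time parameters in this file can be queried as
+	 * constants; e.g., for the 910B target used as the default above:
+	 *
+	 * \code
+	 * // unified-buffer size constant of the selected board
+	 * constexpr size_t ub =
+	 * 	grb::config::ASCEND_CACHE_HIERARCHY< grb::config::Ascend_910B >::UB_SIZE;
+	 *
+	 * // allocation mode this backend uses for shared memory segments
+	 * constexpr auto mode =
+	 * 	grb::config::IMPLEMENTATION< grb::ascend >::sharedAllocMode();
+	 * \endcode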
+ * + * \note Adapting the fields should be done with care and may require + * re-compilation and re-installation of the ALP framework. + * + * \ingroup ascendConfig + * + * @see grb::config::IMPLEMENTATION + */ + template<> + class IMPLEMENTATION< ascend > { + + public: + + /** + * A private memory segment shall never be accessed by threads other than + * the thread who allocates it. Therefore we choose aligned mode here. + */ + static constexpr ALLOC_MODE defaultAllocMode() { + return ALLOC_MODE::ALIGNED; + } + + /** + * For the ascend backend, a shared memory-segment should use + * interleaved alloc so that any thread has uniform access on average. + */ + static constexpr ALLOC_MODE sharedAllocMode() { + return ALLOC_MODE::INTERLEAVED; + } + + /** + * \internal + * By default, use the coordinates of the selected backend. + * + * \note This is an extension that may, at some later stage, be used for + * composability with the #grb::bsp1d and #grb::hybrid backends. + * \endinternal + */ + static constexpr Backend coordinatesBackend() { + return IMPLEMENTATION< nonblocking >::coordinatesBackend(); + } + + /** + * \internal + * Whether the backend has vector capacities always fixed to their + * defaults. + * \endinternal + */ + static constexpr bool fixedVectorCapacities() { + return true; + } + + }; + + } // namespace config + + /** @} */ + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_CONFIG'' + diff --git a/include/graphblas/ascend/coordinates.hpp b/include/graphblas/ascend/coordinates.hpp new file mode 100644 index 000000000..9f04187d2 --- /dev/null +++ b/include/graphblas/ascend/coordinates.hpp @@ -0,0 +1,701 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Coordinates for the Ascend backend + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_COORDINATES +#define _H_GRB_ASCEND_COORDINATES + +#include //std::runtime_error +#include +#if defined _DEBUG && !defined NDEBUG + #include +#endif + +#include //size_t +#include + +#include +#include +#include + +#include + +#include + +#include + +#include + +#include + + +namespace grb { + + namespace internal { + + /** + * The Coordinates class is based on that of the reference backend. + * A set of new methods is added to handle local coordinates used + * by the ascend backend. The bufferSize method used by the + * Matrix class relies on parbufSize and prefixbufSize that have + * their own implementation for the ascend backend. 
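+		 *
+		 * As an illustrative sketch only: a caller that owns the raw memory could
+		 * size and attach it as follows, where `alloc` stands for a suitably
+		 * aligned allocation routine (a hypothetical placeholder, not an ALP API):
+		 *
+		 * \code
+		 * const size_t n = 1024;
+		 * void * const arr = alloc( Coordinates< ascend >::arraySize( n ) );
+		 * // bufferSize( n ) == stackSize( n ) + parbufSize( n ) + prefixbufSize()
+		 * void * const buf = alloc( Coordinates< ascend >::bufferSize( n ) );
+		 *
+		 * Coordinates< ascend > coors;
+		 * coors.set( arr, false, buf, n ); // starts out with zero nonzeroes
+		 * \endcode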
+ */ + template<> + class Coordinates< ascend > { + + public: + + typedef typename config::VectorIndexType StackType; + + typedef bool ArrayType; + + + private: + + bool * __restrict__ _assigned; + + StackType * __restrict__ _stack; + + StackType * __restrict__ _buffer; + + size_t _n; + + size_t _cap; + + size_t _buf; + + // pointers to the data of the local coordinates mechanism + std::vector< config::VectorIndexType * > local_buffer; + config::VectorIndexType * __restrict__ local_new_nnzs; + config::VectorIndexType * __restrict__ pref_sum; + + // the analytic model used during the execution of a pipeline + AnalyticModel analytic_model; + + + public: + + static inline size_t arraySize( const size_t dim ) noexcept { + if( dim == 0 ) { + return 0; + } + return ( dim + 1 ) * sizeof( ArrayType ); + } + + static inline size_t stackSize( const size_t dim ) noexcept { + if( dim == 0 ) { + return 0; + } + return ( dim + 1 ) * sizeof( StackType ); + } + + static inline size_t prefixbufSize() noexcept { + int P = 1; + return ( P + 1 ) * sizeof( StackType ); + } + + static inline size_t parbufSize( const size_t n ) noexcept { + return n * sizeof( StackType ); + } + + static inline size_t bufferSize( const size_t dim ) noexcept { + size_t ret = stackSize( dim ); + ret += parbufSize( dim ); + ret += prefixbufSize(); + return ret; + } + + inline Coordinates() noexcept : + _assigned( nullptr ), _stack( nullptr ), _buffer( nullptr ), + _n( 0 ), _cap( 0 ), _buf( 0 ) + {} + + inline Coordinates( Coordinates< ascend > &&x ) noexcept : + _assigned( x._assigned ), _stack( x._stack ), _buffer( x._buffer ), + _n( x._n ), _cap( x._cap ), _buf( x._buf ) + { + x._assigned = nullptr; + x._stack = nullptr; + x._buffer = nullptr; + x._n = x._cap = x._buf = 0; + } + + inline Coordinates( const Coordinates< ascend > &x ) noexcept : + _assigned( x._assigned ), _stack( x._stack ), _buffer( x._buffer ), + _n( x._n ), _cap( x._cap ), _buf( x._buf ) + { + assert( this != &x ); + } + + inline Coordinates< ascend > & operator=( + const Coordinates< ascend > &other + ) { + Coordinates replace( other ); + *this = std::move( replace ); + return *this; + } + + inline Coordinates< ascend > & operator=( + Coordinates< ascend > &&x + ) noexcept { + assert( this != &x ); + _assigned = x._assigned; + _stack = x._stack; + _buffer = x._buffer; + _n = x._n; + _cap = x._cap; + _buf = x._buf; + x._assigned = nullptr; + x._stack = x._buffer = nullptr; + x._n = x._cap = x._buf = 0; + return *this; + } + + inline ~Coordinates() noexcept { + // done (the #_assigned and #_stack memory + // blocks are not managed by this class) + } + + void set( + void * const arr, bool arr_initialized, + void * const buf, const size_t dim, bool parallel = true + ) noexcept { + // catch trivial case + if( arr == nullptr || buf == nullptr ) { + assert( arr == nullptr ); + assert( buf == nullptr ); + assert( dim == 0 ); + _assigned = nullptr; + _stack = nullptr; + _buffer = nullptr; + _n = 0; + _cap = 0; + _buf = 0; + return; + } + + // _assigned has no alignment issues, take directly from input buffer + assert( reinterpret_cast< uintptr_t >( _assigned ) % sizeof( bool ) == 0 ); + _assigned = static_cast< bool * >( arr ); + // ...but _stack does have potential alignment issues: + char * buf_raw = static_cast< char * >( buf ); + constexpr const size_t size = sizeof( StackType ); + const size_t mod = reinterpret_cast< uintptr_t >( buf_raw ) % size; + if( mod != 0 ) { + buf_raw += size - mod; + } + _stack = reinterpret_cast< StackType * >( buf_raw ); + // no 
alignment issues between stack and buffer, so just shift by dim: + _buffer = _stack + dim; + // initialise + _n = 0; + _cap = dim; + _buf = 0; + + // and initialise _assigned (but only if necessary) + if( dim > 0 && !arr_initialized ) { + if( parallel ) { + #pragma omp parallel + { + size_t start, end; + config::OMP::localRange( start, end, 0, dim ); + for( size_t i = start; i < end; ++i ) { + _assigned[ i ] = false; + } + } + } else { + for( size_t i = 0; i < dim; ++i ) { + _assigned[ i ] = false; + } + } + } + } + + inline bool assign( const size_t i ) noexcept { + if( _n == _cap ) { + return true; + } + if( !_assigned[ i ] ) { + _assigned[ i ] = true; + const size_t newSize = _n + 1; + assert( _n <= _cap ); + assert( newSize <= _cap ); + _stack[ _n ] = i; + _n = newSize; + return false; + } else { + return true; + } + } + + template< bool maybe_invalid = false > + inline void local_assignAll( ) noexcept { + if( maybe_invalid || _n != _cap ) { + if( _assigned != nullptr ) { + assert( _stack != nullptr ); + assert( maybe_invalid || _n < _cap ); + assert( !maybe_invalid || _n <= _cap ); + _n = _cap; + + for( size_t i = 0; i < _n; ++i ) { + _assigned[ i ] = true; + _stack[ i ] = i; + } + } + } + + // the counter of initial nonzeroes in the local stack is stored in the + // buffer immediately before the local stack + StackType * __restrict__ local_nnzs = _stack - 1; + + // the counter for the local stack must be set to zero such that the number + // of new nonzeroes will be set to _n by asyncJoinSubset and joinSubset + // will update the global stack based on the local_new_nnzs counter the + // global stack has become empty and _assigned = false so the local + // coordinates of this tile must be added in the global stack from scratch + // regardless whether this tile was already dense or not as it is hard to + // know which part of the global stack contains the coordinates of this + // tile + *local_nnzs = 0; + } + + template< bool maybe_invalid = false > + inline void local_assignAllNotAlreadyAssigned( ) noexcept { + if( maybe_invalid || _n != _cap ) { + if( _assigned != nullptr ) { + assert( _stack != nullptr ); + assert( maybe_invalid || _n < _cap ); + assert( !maybe_invalid || _n <= _cap ); + + // searching for the not already assigned elements and add them to the + // local stack such that joinSubset will add to the global stack only + // those elements that are not already assigned + for( size_t i = 0; i < _cap; ++i ) { + if( !_assigned[ i ] ) { + _assigned[ i ] = true; + _stack[ _n++ ] = i; + } + } + + assert( _n == _cap ); + } + } + } + + inline void clear() noexcept { + + if( _n == _cap ) { +#ifndef NDEBUG + if( _assigned == nullptr && _cap > 0 ) { + const bool dense_coordinates_may_not_call_clear = false; + assert( dense_coordinates_may_not_call_clear ); + } +#endif + + #pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() ) + for( size_t i = 0; i < _cap; ++i ) { + _assigned[ i ] = false; + } + } else { + if( _n < config::OMP::minLoopSize() ) { + for( size_t k = 0; k < _n; ++k ) { + _assigned[ _stack[ k ] ] = false; + } + } else { + #pragma omp parallel for schedule( dynamic, config::CACHE_LINE_SIZE::value() ) + for( size_t k = 0; k < _n; ++k ) { + _assigned[ _stack[ k ] ] = false; + } + } + } + _n = 0; + } + + inline void local_clear() noexcept { + + if( _n == _cap ) { +#ifndef NDEBUG + if( _assigned == nullptr && _cap > 0 ) { + const bool dense_coordinates_may_not_call_clear = false; + assert( dense_coordinates_may_not_call_clear ); + } +#endif + + for( 
size_t i = 0; i < _cap; ++i ) { + _assigned[ i ] = false; + } + } else { + for( size_t k = 0; k < _n; ++k ) { + _assigned[ _stack[ k ] ] = false; + } + } + _n = 0; + + // the counter of initial nonzeroes in the local stack is stored in the + // buffer immediately before the local stack + StackType * __restrict__ local_nnzs = _stack - 1; + + // the counter for the local stack must be set to zero such that any new + // assigned element will be written to the global stack + *local_nnzs = 0; + } + + inline void reset_global_nnz_counter() noexcept { + _n = 0; + } + + inline bool isEmpty() const noexcept { + if( _n == 0 ) { + return true; + } else { + return false; + } + } + + inline bool isDense() const noexcept { + return _n == _cap; + } + + inline size_t size() const noexcept { + return _cap; + } + + inline bool assigned( const size_t i ) const noexcept { + assert( i < _cap ); + return _n == _cap || _assigned[ i ]; + } + + template< Descriptor descr, typename T > + inline bool mask( const size_t i, const T * const val ) const noexcept { + assert( i < _cap ); + return utils::interpretMask< descr >( assigned( i ), val, i ); + } + + inline size_t nonzeroes() const noexcept { + assert( _n <= _cap ); + return _n; + } + + inline size_t index( const size_t k ) const noexcept { + assert( k < _n ); + return isDense() ? k : _stack[ k ]; + } + + void localCoordinatesInit( const AnalyticModel &am ) { + + analytic_model = am; + + const size_t nthreads = analytic_model.getNumThreads(); + const size_t tile_size = analytic_model.getTileSize(); + const size_t num_tiles = analytic_model.getNumTiles(); + + assert( num_tiles > 0 ); + assert( _buf >= 4 * num_tiles ); + + local_buffer.resize( analytic_model.getNumTiles() ); + + #pragma omp parallel for schedule(dynamic) num_threads(nthreads) + for( size_t tile_id = 0; tile_id < num_tiles; ++tile_id ) { + local_buffer[ tile_id ] = _buffer + tile_id * ( tile_size + 1 ); + } + + local_new_nnzs = _buffer + num_tiles * ( tile_size + 1 ); + pref_sum = _buffer + num_tiles * ( tile_size + 2 ); + } + + /** + * Initialises a Coordinate instance that refers to a subset of this + * coordinates instance. Multiple disjoint subsets may be retrieved + * and concurrently updated, up to a maximum of tiles given by + * #internal::ASCEND::maxBufferTiles(). + * + * Subsets must be contiguous. If one thread calls this function, all + * other threads must make a matching call. + * + * @param[in] lower_bound The start index of the contiguous subset + * (inclusive). + * @param[in] upper_bound The end index of the contiguous subset + * (exclusive). 
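+			 *
+			 * For illustration only (a sketch of intended use, not code taken from
+			 * a pipeline): a stage that updates one tile typically pairs this call
+			 * with #asyncSubset and #asyncJoinSubset. Here, `coors` is the global
+			 * Coordinates instance prepared via #localCoordinatesInit, `lo` and
+			 * `hi` delimit the tile, and `i` is a global index inside it:
+			 *
+			 * \code
+			 * coors.asyncSubsetInit( lo, hi );
+			 * Coordinates< ascend > local = coors.asyncSubset( lo, hi );
+			 * if( !local.assign( i - lo ) ) {
+			 * 	// index i was newly assigned within this tile
+			 * }
+			 * coors.asyncJoinSubset( local, lo, hi ); // records the new nonzeroes
+			 * \endcode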
+ */ + void asyncSubsetInit( + const size_t lower_bound, + const size_t upper_bound + ) noexcept { + if( _cap == 0 ) { + return; + } + + const size_t tile_id = lower_bound / analytic_model.getTileSize(); + + config::VectorIndexType *local_nnzs = local_buffer[ tile_id ]; + config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1; + + *local_nnzs = 0; + if( upper_bound - lower_bound < _n ) { + for( size_t i = lower_bound; i < upper_bound; ++i ) { + if( _assigned[ i ] ) { + local_stack[ (*local_nnzs)++ ] = i - lower_bound; + } + } + } else { + for( size_t i = 0; i < _n; ++i ) { + const size_t k = _stack[ i ]; + if( lower_bound <= k && k < upper_bound ) { + assert( _assigned[ k ] ); + local_stack[ (*local_nnzs)++ ] = k - lower_bound; + } + } + } + + // the number of new nonzeroes is initialized here + local_new_nnzs[ tile_id ] = 0; + } + + /** + * Retrieves a subset coordinate instance that was previously initialised + * using a call to #asyncSubsetInit. + * + * @returns A Coordinates instance that only supports sequential + * (synchronous) updates as well as all queries. + */ + Coordinates< ascend > asyncSubset( + const size_t lower_bound, const size_t upper_bound + ) const noexcept { + assert(_cap > 0); + + const size_t tile_id = lower_bound / analytic_model.getTileSize(); + + config::VectorIndexType *local_nnzs = local_buffer[ tile_id ]; + config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1; + + Coordinates< ascend > ret; + assert( upper_bound - lower_bound <= analytic_model.getTileSize() ); + + ret.set( _assigned + lower_bound, true, local_stack, + upper_bound - lower_bound, false ); + + // the number of new nonzeroes is used to determine the total number + // of nonzeroes for the given local coordinates, since some of the + // nonzeroes are already written on the local statck + ret._n = (*local_nnzs) + local_new_nnzs[ tile_id ]; + assert( ret._n <= ret._cap ); + + ret._buf = 0; + + return ret; + } + + /** + * Saves the state of a subset Coordinates instance. Can be retrieved later + * once again via a call to #asyncSubset. New nonzeroes will be committed + * to the global coordinate structure via a call to #joinSubset, which will + * furthermore set the related tile to inactive. 
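+			 *
+			 * For illustration only (a sketch of intended use): once every tile has
+			 * been joined asynchronously, the new nonzeroes are committed to the
+			 * global stack via #prefixSumComputation followed by one #joinSubset
+			 * call per tile. Here, `tile_size` and `num_tiles` stand for the values
+			 * of the analytic model:
+			 *
+			 * \code
+			 * if( coors.newNonZeroes() ) {
+			 * 	coors.prefixSumComputation(); // per-tile offsets into the stack
+			 * 	for( size_t t = 0; t < num_tiles; ++t ) {
+			 * 		const size_t lo = t * tile_size;
+			 * 		const size_t hi = std::min( lo + tile_size, coors.size() );
+			 * 		coors.joinSubset( lo, hi );
+			 * 	}
+			 * }
+			 * \endcode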
+ */ + void asyncJoinSubset( + const Coordinates< ascend > &subset, + const size_t lower_bound, const size_t upper_bound + ) { + assert( _cap > 0 ); + + (void) upper_bound; + + const size_t tile_id = lower_bound / analytic_model.getTileSize(); + + config::VectorIndexType *local_nnzs = local_buffer[ tile_id ]; + + assert( subset._n <= subset._cap ); + assert( (*local_nnzs) <= subset._cap ); + + local_new_nnzs[ tile_id ] = subset._n - (*local_nnzs); + } + + bool newNonZeroes() const { + + if( _cap == 0 ) { + return false; + } + + const size_t num_tiles = analytic_model.getNumTiles(); + + for( size_t i = 0; i < num_tiles; i++ ) { + if( local_new_nnzs[ i ] > 0 ) { + return true; + } + } + return false; + } + + void prefixSumComputation() { + + const size_t num_tiles = analytic_model.getNumTiles(); + + // takes into accout the size of data for each iteration of the prefix sum + // computation which is used to determine the number of parallel task that + // should be used such that the data of each parallel task fit in the L1 + // cache + constexpr size_t size_of_data = sizeof( pref_sum[0] ) + + sizeof( local_new_nnzs[0] ); + + // make use of the analytic model to estimate a proper number of threads + // and a tile size + AnalyticModel am( size_of_data, num_tiles, 1 ); + + const size_t nthreads = am.getNumThreads(); + const size_t prefix_sum_tile_size = am.getTileSize(); + const size_t prefix_sum_num_tiles = am.getNumTiles(); + + // make a run-time decision to choose between sequential and parallel + // prefix sum implementation the sequential prefix sum implementation is + // more efficient for a small number of tiles + if( num_tiles < prefix_sum_tile_size ) { + // sequential computation of the prefix sum + pref_sum[ 0 ] = _n + local_new_nnzs[ 0 ]; + for( size_t i = 1; i < num_tiles; i++ ) { + pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ]; + } + } else { + // parallel computation of the prefix sum + size_t local_prefix_sum[ prefix_sum_num_tiles ]; + + #pragma omp parallel num_threads(nthreads) + { + #pragma omp for + for( size_t id = 0; id < prefix_sum_num_tiles; id++ ) { + + size_t lower, upper; + config::OMP::localRange( lower, upper, 0, num_tiles, + prefix_sum_tile_size, id, prefix_sum_num_tiles ); + + // the number of threads used for parallel computation must not exceed + // num_tiles, otherwise the code below results in data races + assert( id <= num_tiles ); + assert( id < prefix_sum_num_tiles - 1 || upper == num_tiles ); + assert( lower <= upper ); + assert( upper <= num_tiles ); + + pref_sum[ lower ] = local_new_nnzs[ lower ]; + for( size_t i = lower + 1; i < upper; i++ ) { + pref_sum[ i ] = pref_sum[ i - 1 ] + local_new_nnzs[ i ]; + } + + // each thread stores the prefix sum of its last element in + // local_prefix_sum + // the memory location is specified by the identifier of the thread to + // avoid data races + local_prefix_sum[ id ] = pref_sum[ upper - 1 ]; + } + + // here, there is an implicit barrier that ensures all threads have + // already written the local prefix sum for each parallel task + + // a single threads computes the prefix sum for the last element of each + // thread + #pragma omp single + { + for( size_t i = 1; i < prefix_sum_num_tiles; i++ ) { + local_prefix_sum[ i ] += local_prefix_sum[ i - 1 ]; + } + } + + #pragma omp for + for(size_t id = 0; id < prefix_sum_num_tiles; id++ ) { + + size_t lower, upper; + config::OMP::localRange( lower, upper, 0, num_tiles, + prefix_sum_tile_size, id, prefix_sum_num_tiles ); + + // the first thread (id=0) needs to add 
only the number of nonzeroes(_n) + const size_t acc = _n + ( ( id > 0 ) ? local_prefix_sum[ id - 1 ] : 0 ); + for( size_t i = lower; i < upper; i++ ) { + pref_sum[ i ] += acc; + } + } + } + +#ifdef _DEBUG + // ensures that the parallel implementation computes the same result + // with the following sequential implementation + size_t seq_offsets[ num_tiles ]; + seq_offsets[ 0 ] = _n + local_new_nnzs[ 0 ]; + for( size_t i = 1; i < num_tiles; i++ ) { + seq_offsets[ i ] = seq_offsets[ i - 1 ] + local_new_nnzs[ i ]; + } + + for( size_t i = 0; i < num_tiles; i++ ) { + assert( seq_offsets[i] == pref_sum[i] ); + } +#endif + } + + // a single thread updates the number of nonzeroes + // the last element of prefix_sum_ofssets alredy includes + // the current number of nonzeroes _n which was added earlier + _n = pref_sum[ num_tiles - 1 ]; + } + + /** + * Takes a currently active subset and commits it to the global storage. + * After completion the given active tile will be marked inactive. + */ + void joinSubset( const size_t lower_bound, const size_t upper_bound ) { + if( _cap == 0 ) { + return; + } +#ifdef NDEBUG + ( void )upper_bound; +#endif + const size_t tile_id = lower_bound / analytic_model.getTileSize(); + + config::VectorIndexType *local_nnzs = local_buffer[ tile_id ]; + config::VectorIndexType *local_stack = local_buffer[ tile_id ] + 1; + + const size_t local_stack_start = *local_nnzs; + const size_t local_stack_end = *local_nnzs + local_new_nnzs[ tile_id ]; + assert( local_stack_start <= local_stack_end ); + + size_t pos = pref_sum[ tile_id ] - local_new_nnzs[ tile_id ]; + + for( size_t k = local_stack_start; k < local_stack_end; ++k ) { + const size_t local_index = local_stack[ k ]; + const size_t global_index = local_index + lower_bound; + + assert( global_index >= lower_bound ); + assert( global_index < upper_bound ); + assert( _assigned[ global_index ] ); + assert( pos < _cap ); + + _stack[ pos++ ] = global_index; + } + + local_new_nnzs[ tile_id ] = 0; + } + }; + + } // namespace internal + +} // namespace grb + +#endif // end `_H_GRB_ASCEND_COORDINATES' + diff --git a/include/graphblas/ascend/data_utils.h b/include/graphblas/ascend/data_utils.h new file mode 100644 index 000000000..042de8bb0 --- /dev/null +++ b/include/graphblas/ascend/data_utils.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
+#define CHECK_ACL(x) \
+    do { \
+        aclError __ret = x; \
+        if (__ret != ACL_ERROR_NONE) { \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+        } \
+    } while (0);
+
+/**
+ * @brief Read data from file
+ * @param [in] filePath: file path
+ * @param [out] fileSize: file size
+ * @return read result
+ */
+bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize)
+{
+    struct stat sBuf;
+    int fileStatus = stat(filePath.data(), &sBuf);
+    if (fileStatus == -1) {
+        ERROR_LOG("failed to get file");
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+
+    std::ifstream file;
+    file.open(filePath, std::ios::binary);
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    std::filebuf *buf = file.rdbuf();
+    size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in);
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        file.close();
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        file.close();
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    buf->sgetn(static_cast<char *>(buffer), size);
+    fileSize = size;
+    file.close();
+    return true;
+}
+
+/**
+ * @brief Write data to file
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file
+ * @param [in] size: size to write
+ * @return write result
+ */
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+
+    int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE);
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    auto writeSize = write(fd, buffer, size);
+    (void) close(fd);
+    if (writeSize != size) {
+        ERROR_LOG("Write file Failed.");
+        return false;
+    }
+
+    return true;
+}
+
+template <typename T>
+void DoPrintData(const T *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << data[i];
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow)
+{
+    assert(elementsPerRow != 0);
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]);
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow=16)
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+
+    switch (dataType) {
+        case BOOL:
+            DoPrintData(reinterpret_cast<const bool *>(data), count, elementsPerRow);
+            break;
+        case INT8_T:
+            DoPrintData(reinterpret_cast<const int8_t *>(data), count, elementsPerRow);
+            break;
+        case UINT8_T:
+            DoPrintData(reinterpret_cast<const uint8_t *>(data), count, elementsPerRow);
+            break;
+        case INT16_T:
+            DoPrintData(reinterpret_cast<const int16_t *>(data), count, elementsPerRow);
+            break;
+        case UINT16_T:
+            DoPrintData(reinterpret_cast<const uint16_t *>(data), count, elementsPerRow);
+            break;
+        case INT32_T:
+            DoPrintData(reinterpret_cast<const int32_t *>(data), count, elementsPerRow);
+            break;
+        case UINT32_T:
+            DoPrintData(reinterpret_cast<const uint32_t *>(data), count, elementsPerRow);
+            break;
+        case INT64_T:
+            DoPrintData(reinterpret_cast<const int64_t *>(data), count, elementsPerRow);
+            break;
+        case UINT64_T:
+            DoPrintData(reinterpret_cast<const uint64_t *>(data), count, elementsPerRow);
+            break;
+        case HALF:
+            DoPrintHalfData(reinterpret_cast<const aclFloat16 *>(data), count, elementsPerRow);
+            break;
+        case FLOAT:
+            DoPrintData(reinterpret_cast<const float *>(data), count, elementsPerRow);
+            break;
+        case DOUBLE:
+            DoPrintData(reinterpret_cast<const double *>(data), count, elementsPerRow);
+            break;
+        default:
+            ERROR_LOG("Unsupported type: %d", dataType);
+    }
+    std::cout << std::endl;
+}
+#endif // DATA_UTILS_H
diff --git a/include/graphblas/ascend/exec.hpp b/include/graphblas/ascend/exec.hpp
new file mode 100644
index 000000000..c7807c6c5
--- /dev/null
+++ b/include/graphblas/ascend/exec.hpp
@@ -0,0 +1,104 @@
+
+/*
+ * Copyright 2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file
+ *
+ * Implements the launcher for the Ascend backend.
+ *
+ * @author A. N. Yzelman
+ * @date 12th of September, 2023
+ */
+
+#ifndef _H_GRB_ASCEND_EXEC
+#define _H_GRB_ASCEND_EXEC
+
+#include <stdexcept>
+#include <string>
+
+#include "init.hpp"
+
+
+namespace grb {
+
+	/** The Launcher class is based on that of the reference backend */
+	template< EXEC_MODE mode >
+	class Launcher< mode, ascend > {
+
+		private:
+
+			Launcher< mode, reference > ref;
+
+		public:
+
+			/**
+			 * This implementation only accepts a single user process. It ignores
+			 * \a hostname and \a port.
+			 */
+			Launcher(
+				const size_t process_id = 0,
+				const size_t nprocs = 1,
+				const std::string hostname = "localhost",
+				const std::string port = "0"
+			) {
+				// ignore hostname and port
+				(void) hostname;
+				(void) port;
+				// sanity checks
+				if( nprocs != 1 ) {
+					throw std::invalid_argument( "Total number of user processes must be "
+						"exactly one when using the ascend implementation."
+					);
+				}
+				if( process_id != 0 ) {
+					throw std::invalid_argument( "Process ID must always be zero in the "
+						"ascend implementation."
+					);
+				}
+			}
+
+			/** No implementation notes.
*/ + ~Launcher() {} + + /** exec is based on that of the reference backend */ + template< typename U > + RC exec( + void ( *grb_program )( const void *, const size_t, U & ), + const void * data_in, const size_t in_size, + U &data_out, const bool broadcast = false + ) const { + return ref.exec( grb_program, data_in, in_size, data_out, broadcast ); + } + + /** exec is based on that of the reference backend */ + template< typename T, typename U > + RC exec( + void ( *grb_program )( const T &, U & ), + const T &data_in, U &data_out, + const bool broadcast = false + ) { + return ref.exec( grb_program, data_in, data_out, broadcast ); + } + + /** finalize is based on that of the reference backend */ + grb::RC finalize() { return ref.finalize(); } + }; + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_EXEC'' + diff --git a/include/graphblas/ascend/forward.hpp b/include/graphblas/ascend/forward.hpp new file mode 100644 index 000000000..9d2e2fbec --- /dev/null +++ b/include/graphblas/ascend/forward.hpp @@ -0,0 +1,51 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Forward declarations required by the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_FORWARD +#define _H_GRB_ASCEND_FORWARD + + +namespace grb { + + // The eWiseLambda is a friend of matrix but defined in blas2. Therefore it is + // forward-declared and this forward definition file included from both + // matrix.hpp and blas2.hpp + template< + class ActiveDistribution = internal::Distribution< ascend >, + typename Func, typename DataType, + typename RIT, typename CIT, typename NIT + > + RC eWiseLambda( + const Func f, + const Matrix< DataType, ascend, RIT, CIT, NIT > &A, + const size_t s = 0, const size_t P = 1 + ); + // end eWiseLambda declarations + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_FORWARD'' + diff --git a/include/graphblas/ascend/grid.hpp b/include/graphblas/ascend/grid.hpp new file mode 100644 index 000000000..7b0255213 --- /dev/null +++ b/include/graphblas/ascend/grid.hpp @@ -0,0 +1,186 @@ +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_ALP_ASCEND_GRID +#define _H_ALP_ASCEND_GRID + +#include +#include + +#include +#include +#include +#include + +namespace alp +{ + namespace internal + { + extern AscendLazyEvaluation ale; + } +} + +namespace alp { + + namespace internal { + + class iGrid { + + private: + size_t process_order; + size_t problem_order; + + public: + iGrid( size_t proc, size_t prob ); + size_t getProcessOrder() const noexcept; + size_t getProblemOrder() const noexcept; + std::string processSize( const size_t k ) const noexcept; + std::string processMode( const size_t k ) const noexcept; + std::string problemSize( const size_t k ) const noexcept; + std::string problemMode( const size_t k ) const noexcept; + std::string problemMainMode( const size_t k ) const noexcept; + std::string problemTileMode( const size_t k ) const noexcept; + std::string tileSize( const size_t k ) const noexcept; + }; + + } + + /** + * Specific to the ALP/Ascend backend, this class maps problem spaces on + * process grids in a symbolic fashion. + */ + template< size_t process_order, size_t problem_order > + class Grid { + + private: + // problem mesh related state: +// std::vector< std::string > problem_sizes, problem_space, chunk_sizes; + + public: + Grid() noexcept; + std::string processSize( const size_t k ) const noexcept; + std::string processMode( const size_t k ) const noexcept; + std::string problemSize( const size_t k ) const noexcept; + std::string problemMode( const size_t k ) const noexcept; + std::string problemMainMode( const size_t k ) const noexcept; + std::string problemTileMode( const size_t k ) const noexcept; + std::string tileSize( const size_t k ) const noexcept; +// std::string chunkSize( const size_t k ) const noexcept; + grb::RC forEach( const std::vector< int > axes, const std::function < void( void ) > func ) const; + }; + + template< size_t process_order, size_t problem_order > + Grid< process_order, problem_order >::Grid() noexcept + { +// for( size_t k = 0; k < problem_order; ++k ) { +// chunk_sizes.push_back( "BLOCK_LENGTH" + k ); +// } + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::processSize( const size_t k ) const noexcept { + return "p" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::processMode( const size_t k ) const noexcept { + return "a" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::problemSize( const size_t k ) const noexcept { + return "n" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::problemMode( const size_t k ) const noexcept { + return "i" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::problemMainMode( const size_t k ) const noexcept { + return "z" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::problemTileMode( const size_t k ) const noexcept { + return "t" + std::to_string( k ); + } + + template< size_t process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::tileSize( const size_t k ) const noexcept { + return "tile_size" + std::to_string( k ); + } + +/* template< size_t 
process_order, size_t problem_order > + std::string Grid< process_order, problem_order >::chunkSize( const size_t k ) const noexcept { + return chunk_sizes[ k ]; + } +*/ + template< size_t process_order, size_t problem_order > + grb::RC Grid< process_order, problem_order >::forEach( const std::vector< int > axes, const std::function < void( void ) > func ) const { + + alp::internal::OpGen gen(); + + if( internal::OpGen::lastAxes.size() > 0 && internal::OpGen::lastAxes != axes ) { + alp::internal::ale.addPipeline(); + } + + if( internal::invalidForEachAxes( axes ) == true ) { + std::cerr << "The axes of a nested forEach cannot overlap with the axes of another forEach." << std::endl; + std::abort(); + } + + internal::OpGen::forEachAxes.push_back( axes ); + + // indicate the beginning of the forEach + internal::OpGen::forEachLevel++; + + // TODO: this is currently used only by the Tensor class in the getView method + // perhaps the getView should be a method of the Grid class +// internal::OpGen::parallelAxes = axes; + + // the current design assumes that each forEach is a new pipeline + // which is explicitly added here, later we need to figure out + // how we determine the creation of a pipeline +// alp::internal::ale.addPipeline( axes ); + + // TODO: emit for-loop intro + func(); + // TODO: emit for-loop outro + + // before leaving a forEach loop, any getView of an input Tensor + // should match with an implicit Stage for freeing any allocated memory + alp::internal::ale.insertFreeInputTensorStages( internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ) ); + + // indicate the end of the forEach + internal::OpGen::forEachLevel--; + + internal::OpGen::forEachAxes.pop_back(); + + internal::OpGen::lastAxes = axes; + + return grb::SUCCESS; + } +} + +#endif + diff --git a/include/graphblas/ascend/init.hpp b/include/graphblas/ascend/init.hpp new file mode 100644 index 000000000..e6d35829d --- /dev/null +++ b/include/graphblas/ascend/init.hpp @@ -0,0 +1,61 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the initialisation and finalisation routines for the ascend + * backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_INIT +#define _H_GRB_ASCEND_INIT + +#include +#include + + +namespace grb { + + template<> + RC init< ascend >( const size_t, const size_t, void * const ); + + template<> + RC finalize< ascend >(); + + namespace internal { + + /** Internal state of the ascend backend. */ + class ASCEND { + + friend RC init< ascend >( const size_t, const size_t, void * const ); + + private: + + public: + + }; + + } + +} // namespace grb + +#endif //``end _H_GRB_ASCEND_INIT'' + diff --git a/include/graphblas/ascend/io.hpp b/include/graphblas/ascend/io.hpp new file mode 100644 index 000000000..9e2177874 --- /dev/null +++ b/include/graphblas/ascend/io.hpp @@ -0,0 +1,1353 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the I/O primitives for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_IO +#define _H_GRB_ASCEND_IO + +#include +#include +#include + +#include + +#include "boolean_dispatcher_io.hpp" + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a value input iterator with element " \ + "types that match the output vector element type.\n" \ + "* Possible fix 3 | If applicable, provide an index input iterator " \ + "with element types that are integral.\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + +} + +namespace grb { + + /** + * \defgroup IO Data Ingestion -- ascend backend + * @{ + */ + + template< typename InputType, typename RIT, typename CIT, typename NIT > + uintptr_t getID( const Matrix< InputType, ascend, RIT, CIT, NIT > &A ) { + return getID( internal::getRefMatrix( A ) ); + } + + template< typename DataType, typename Coords > + size_t size( const Vector< DataType, ascend, Coords > &x ) noexcept { + return internal::getCoordinates( x ).size(); + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + size_t nrows( + const Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return nrows( internal::getRefMatrix( A ) ); + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + size_t ncols( + const Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return ncols( internal::getRefMatrix( A ) ); + } + + template< typename DataType, typename Coords > + size_t nnz( const Vector< DataType, ascend, Coords > &x ) noexcept { + internal::le.execution( &x ); + return internal::getCoordinates( x ).nonzeroes(); + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + size_t nnz( + const Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return nnz( internal::getRefMatrix( A ) ); + } + + template< typename DataType, typename Coords > + size_t capacity( const Vector< DataType, ascend, Coords > &x ) noexcept { + return internal::getCoordinates( x ).size(); + } + + template< typename DataType, typename RIT, typename CIT, typename NIT > + size_t capacity( + const 
Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return capacity( internal::getRefMatrix( A ) ); + } + + template< typename DataType, typename Coords > + RC clear( Vector< DataType, ascend, Coords > &x ) noexcept { + internal::le.execution( &x ); + internal::getCoordinates( x ).clear(); + assert( false ); + return UNSUPPORTED; + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + RC clear( + Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return clear( internal::getRefMatrix( A ) ); + } + + template< + typename InputType, + typename Coords + > + RC resize( + Vector< InputType, ascend, Coords > &x, + const size_t new_nz + ) noexcept { + internal::le.execution( &x ); +#ifdef _DEBUG + std::cerr << "In grb::resize (vector, ascend)\n"; +#endif + // this cannot wait until after the below check, as the spec defines that + // anything is OK for an empty vector + if( new_nz == 0 ) { + return grb::clear( x ); + } + + // check if we have a mismatch + if( new_nz > grb::size( x ) ) { +#ifdef _DEBUG + std::cerr << "\t requested capacity of " << new_nz << ", " + << "expected a value smaller than or equal to " + << size( x ) << "\n"; +#endif + return ILLEGAL; + } + + // in the ascend implementation, vectors are of static size + // so this function immediately succeeds. However, all existing contents + // must be removed + return grb::clear( x ); + } + + template< + typename InputType, + typename RIT, + typename CIT, + typename NIT + > + RC resize( + Matrix< InputType, ascend, RIT, CIT, NIT > &A, + const size_t new_nz + ) noexcept { + return resize( internal::getRefMatrix( A ), new_nz ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename DataType, + typename T, + typename Coords + > + RC set( + Vector< DataType, ascend, Coords > &x, + const T val, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< DataType >::value && + !grb::is_object< T >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< DataType, T >::value + ), "grb::set (Vector, unmasked)", + "called with a value type that does not match that of the given vector" + ); + + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + // pre-cast value to be copied + const DataType toCopy = static_cast< DataType >( val ); + DataType * const raw = internal::getRaw( x ); + const size_t n = internal::getCoordinates( x ).size(); + + constexpr const bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&x, toCopy, raw] ( + internal::Pipeline &pipeline, + size_t lower_bound, size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage set(x, val) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + bool already_dense_output = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( x ) ); + if( !already_dense_output ) { +#endif + Coords local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + + local_x.local_assignAllNotAlreadyAssigned(); + assert( local_x.nonzeroes() 
== local_x.size() ); + + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + for( size_t i = lower_bound; i < upper_bound; i++ ) { + raw[ i ] = internal::template ValueOrIndex< + descr, DataType, DataType + >::getFromScalar( toCopy, i ); + } + + return SUCCESS; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::IO_SET_SCALAR, + n, sizeof( DataType ), dense_descr, true, + &x, nullptr, + &internal::getCoordinates( x ), nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: SET(x, val)" << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + Descriptor descr, +#ifdef GRB_BOOLEAN_DISPATCHER + bool loop_over_vector_length, + bool already_dense_mask, + bool mask_is_dense, +#endif + typename DataType, + typename MaskType, + typename T, + typename Coords + > + RC masked_set( +#ifndef GRB_BOOLEAN_DISPATCHER + bool loop_over_vector_length, + bool already_dense_mask, + bool mask_is_dense, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + Vector< DataType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const T val + ) { + // pre-cast value to be copied + const DataType toCopy = static_cast< DataType >( val ); + + DataType * const raw = internal::getRaw( x ); + const MaskType * const m_p = internal::getRaw( m ); + +#ifdef _DEBUG + if( loop_over_vector_length ) { + std::cout << "\t using loop of size n (the vector length)\n"; + } else { + std::cout << "\t using loop of size nz (the number of nonzeroes in the " + << "vector)\n"; + } +#endif + + const size_t local_n = upper_bound - lower_bound; + const size_t local_mask_nz = already_dense_mask + ? local_n + : local_mask.nonzeroes(); + + const size_t local_size_n = loop_over_vector_length + ? local_x.size() + : local_mask_nz; + + for( size_t k = 0; k < local_size_n; ++k ) { + + const size_t index = ( ( loop_over_vector_length || already_dense_mask ) + ? 
k + : local_mask.index( k ) ) + lower_bound; + assert( index < internal::getCoordinates( x ).size() ); + if( already_dense_mask ) { + if( !internal::getCoordinates( m ).template mask< descr >( index, m_p ) ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( + index - lower_bound, m_p + lower_bound + ) ) { + continue; + } + } + if( !mask_is_dense ) { + (void) local_x.assign( index - lower_bound ); + } + raw[ index ] = internal::ValueOrIndex< + descr, DataType, DataType + >::getFromScalar( toCopy, index ); + } + + assert( false ); + return UNSUPPORTED; + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename DataType, + typename MaskType, + typename T, + typename Coords + > + RC set( + Vector< DataType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &m, + const T val, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< DataType >::value && !grb::is_object< T >::value, + void >::type * const = nullptr + ) { +#ifdef _DEBUG + std::cout << "In grb::set (vector-to-value, masked)\n"; +#endif + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< DataType, T >::value ), "grb::set (Vector to scalar, masked)", + "called with a value type that does not match that of the given " + "vector" + ); + + // catch empty mask + if( size( m ) == 0 ) { + return set< descr >( x, val, phase ); + } + + // dynamic sanity checks + const size_t sizex = size( x ); + if( sizex != size( m ) ) { + return MISMATCH; + } + + // handle trivial resize + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && + !(descr & descriptors::invert_mask); + + // then source is a pattern vector, just copy its pattern + internal::Pipeline::stage_type func = [&x, &m, val] ( + internal::Pipeline &pipeline, + size_t lower_bound, size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage set(x, m, val) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + (void) pipeline; + + Coords local_mask, local_x; + const size_t local_n = upper_bound - lower_bound; + size_t local_x_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_mask = true; + + const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + // for out-of-place operations with a mask and a scalar input, whether the + // output is dense or not depends on the mask + if( !mask_is_dense ) { + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( dense_descr && local_x_nz < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( m ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( m ).asyncSubset( lower_bound, + upper_bound ); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !mask_is_dense ) { + 
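+				// the masked set-to-scalar is an out-of-place primitive: when the mask
+				// is not statically known to be dense, first empty the local view of
+				// the output; the tile that starts at position zero additionally
+				// resets the global nonzero counter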
local_x.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( x ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( x ) ); + } + } + } + + const bool loop_over_vector_length = ( descr & descriptors::invert_mask ) || + ( 4 * local_mask.nonzeroes() > 3 * local_mask.size() ); + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_set< +#else + rc = internal::masked_set< +#endif + descr, DataType, MaskType, T, Coords + >( + loop_over_vector_length, + already_dense_mask, mask_is_dense, + lower_bound, upper_bound, + local_x, local_mask, x, m, val + ); + + if( !mask_is_dense ) { + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::IO_SET_MASKED_SCALAR, + sizex, sizeof( DataType ), + dense_descr, dense_mask, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &m, nullptr, nullptr, nullptr, + &internal::getCoordinates( m ), nullptr, nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: set(x, m, val)" << std::endl; +#endif + return ret; + } + + template< + Descriptor descr = descriptors::no_operation, + typename DataType, + typename T, + typename Coords + > + RC setElement( + Vector< DataType, ascend, Coords > &x, + const T val, + const size_t i, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< DataType >::value && + !grb::is_object< T >::value, void + >::type * const = nullptr + ) { + internal::le.execution( &x ); + + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< DataType, T >::value ), + "grb::set (Vector, at index)", + "called with a value type that does not match that of the given " + "vector" + ); + if( phase == RESIZE ) { + return SUCCESS; + } + assert( phase == EXECUTE ); + + // dynamic sanity checks + if( i >= size( x ) ) { + return MISMATCH; + } + if( (descr & descriptors::dense) && nnz( x ) < size( x ) ) { + return ILLEGAL; + } + + // do set + (void)internal::getCoordinates( x ).assign( i ); + internal::getRaw( x )[ i ] = static_cast< DataType >( val ); + +#ifdef _DEBUG + std::cout << "setElement (ascend) set index " << i << " to value " + << internal::getRaw( x )[ i ] << "\n"; +#endif + assert( false ); + return UNSUPPORTED; + } + + namespace internal { + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, + bool sparse, +#ifdef GRB_BOOLEAN_DISPATCHER + bool already_dense_vectors, + bool already_dense_input, +#endif + typename OutputType, + typename InputType, + typename Coords + > + RC set_generic( +#ifndef GRB_BOOLEAN_DISPATCHER + bool already_dense_vectors, + bool already_dense_input, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y + ) { + const size_t local_n = upper_bound - lower_bound; + const size_t local_y_nz = already_dense_input + ? 
local_n + : local_y.nonzeroes(); + + OutputType * __restrict__ const dst = internal::getRaw( x ); + const InputType * __restrict__ const src = internal::getRaw( y ); + + if( sparse ) { + if( src == nullptr && dst == nullptr ) { + for( size_t i = 0; i < local_y_nz; ++i ) { + const size_t index = ( already_dense_input ) ? i : local_y.index( i ); + if( !already_dense_vectors ) { + (void) local_x.assign( index ); + } + } + } else { +#ifndef NDEBUG + if( src == nullptr ) { + assert( dst == nullptr ); + } +#endif + for( size_t i = 0; i < local_y_nz; ++i ) { + const size_t index = ( ( already_dense_input ) + ? i + : local_y.index( i ) ) + lower_bound; + if( !already_dense_vectors ) { + (void) local_x.assign( index - lower_bound ); + } + if( !out_is_void && !in_is_void ) { + dst[ index ] = internal::setIndexOrValue< descr, OutputType >( index, + src[ index ] ); + } + } + } + } else { + if( !( src == nullptr && dst == nullptr ) ) { +#ifndef NDEBUG + if( src == nullptr ) { + assert( dst == nullptr ); + } +#endif + for( size_t i = lower_bound; i < upper_bound; ++i ) { + if( !out_is_void && !in_is_void ) { + dst[ i ] = src[ i ]; + } + } + } + } + + assert( false ); + return UNSUPPORTED; + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, + typename InputType, + typename Coords + > + RC set( + Vector< OutputType, ascend, Coords > &x, + const Vector< InputType, ascend, Coords > &y, + const Phase &phase = EXECUTE + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< OutputType, InputType >::value ), + "grb::copy (Vector)", + "called with vector parameters whose element data types do not match" + ); + constexpr bool out_is_void = std::is_void< OutputType >::value; + constexpr bool in_is_void = std::is_void< OutputType >::value; + static_assert( !in_is_void || out_is_void, + "grb::set (ascend, vector <- vector, masked): " + "if input is void, then the output must be also" ); + static_assert( !(descr & descriptors::use_index) || !out_is_void, + "grb::set (ascend, vector <- vector, masked): " + "use_index descriptor cannot be set if output vector is void" ); + + //get length + const size_t n = internal::getCoordinates( y ).size(); + // check contract + if( n != size( x ) ) { + return MISMATCH; + } + if( n == 0 ) { + return SUCCESS; + } + if( getID( x ) == getID( y ) ) { + return ILLEGAL; + } + + // on resize + if( phase == RESIZE ) { + return SUCCESS; + } + + // on execute + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr bool dense_descr = descr & descriptors::dense; + + internal::Pipeline::stage_type func = [&x, &y] ( + internal::Pipeline &pipeline, + size_t lower_bound, size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage set(x, y) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_y_nz = local_n; + bool sparse = false; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + pipeline.allAlreadyDenseVectors(); +#else + (void) pipeline; + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_input = true; + + if( !already_dense_vectors ) { + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_input = pipeline.containsAlreadyDenseVector( + 
&internal::getCoordinates( y ) ); + if( !already_dense_input ) { +#else + already_dense_input = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); + if( local_y_nz < local_n ) { + sparse = true; + } +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !already_dense_vectors ) { + if( lower_bound == 0 ) { + internal::getCoordinates( x ).reset_global_nnz_counter(); + } + } + + if( sparse ) { + // this primitive is out-of-place, thus make the output empty + if( !already_dense_vectors ) { + local_x.local_clear(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) ); +#endif + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_set_generic< +#else + rc = internal::set_generic< +#endif + descr, out_is_void, in_is_void, true + >( + already_dense_vectors, already_dense_input, + lower_bound, upper_bound, + local_x, local_y, x, y + ); + } else { + if( !already_dense_vectors ) { + local_x.local_assignAll(); + } + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_set_generic< +#else + rc = internal::set_generic< +#endif + descr, out_is_void, in_is_void, false + >( + already_dense_vectors, already_dense_input, + lower_bound, upper_bound, + local_x, local_y, x, y + ); + } + + if( !already_dense_vectors ) { + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? ret : internal::le.addStage( + std::move( func ), + internal::Opcode::IO_SET_VECTOR, + n, sizeof( OutputType ), dense_descr, true, + getID( x ), + &x, nullptr, &internal::getCoordinates( x ), nullptr, + getID( y ), SIZE_MAX, SIZE_MAX, SIZE_MAX, + &y, nullptr, nullptr, nullptr, + &internal::getCoordinates( y ), nullptr, nullptr, nullptr, + SIZE_MAX, nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: set(x, y)" << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + Descriptor descr, + bool out_is_void, + bool in_is_void, +#ifdef GRB_BOOLEAN_DISPATCHER + bool loop_over_y, + bool already_dense_input_y, + bool already_dense_mask, + bool mask_is_dense, +#endif + typename OutputType, + typename MaskType, + typename InputType, + typename Coords + > + RC masked_set( +#ifndef GRB_BOOLEAN_DISPATCHER + bool loop_over_y, + bool already_dense_input_y, + bool already_dense_mask, + bool mask_is_dense, +#endif + const size_t lower_bound, + const size_t upper_bound, + Coords &local_x, + const Coords &local_mask, + const Coords &local_y, + Vector< OutputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType, ascend, Coords > &y + ) { + const size_t local_n = upper_bound - lower_bound; + const size_t local_y_nz = already_dense_input_y + ? local_n + : local_y.nonzeroes(); + const size_t local_mask_nz = already_dense_mask + ? local_n + : local_mask.nonzeroes(); + + const size_t n = loop_over_y ? local_y_nz : local_mask_nz; + + for( size_t k = 0; k < n; ++k ) { + const size_t i = ( loop_over_y + ? ( already_dense_input_y ? k : local_y.index( k ) ) + : ( already_dense_mask ? 
k : local_mask.index( k ) ) + ) + lower_bound; + if( already_dense_mask ) { + if( !internal::getCoordinates( mask ).template mask< descr >( + i, internal::getRaw( mask ) + ) ) { + continue; + } + } else { + if( !local_mask.template mask< descr >( + i - lower_bound, internal::getRaw( mask ) + lower_bound + ) ) { + continue; + } + } + if( loop_over_y || already_dense_input_y || + local_y.assigned( i - lower_bound ) + ) { + if( !out_is_void && !in_is_void ) { + if( !mask_is_dense ) { + (void) local_x.assign( i - lower_bound ); + } + internal::getRaw( x )[ i ] = internal::ValueOrIndex< + descr, OutputType, InputType + >::getFromArray( + internal::getRaw( y ), + [] (const size_t i) {return i;}, + i + ); + } + } + } + + assert( false ); + return UNSUPPORTED; + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, + typename MaskType, + typename InputType, + typename Coords + > + RC set( + Vector< OutputType, ascend, Coords > &x, + const Vector< MaskType, ascend, Coords > &mask, + const Vector< InputType, ascend, Coords > &y, + const Phase &phase = EXECUTE, + const typename std::enable_if< + !grb::is_object< OutputType >::value && + !grb::is_object< MaskType >::value && + !grb::is_object< InputType >::value, + void >::type * const = nullptr + ) { + // static sanity checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< OutputType, InputType >::value ), + "grb::set (Vector)", + "called with vector parameters whose element data types do not match" ); + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< MaskType, bool >::value ), + "grb::set (Vector)", + "called with non-bool mask element types" ); + constexpr bool out_is_void = std::is_void< OutputType >::value; + constexpr bool in_is_void = std::is_void< OutputType >::value; + static_assert( !in_is_void || out_is_void, + "grb::set (ascend, vector <- vector, masked): " + "if input is void, then the output must be also" ); + static_assert( !(descr & descriptors::use_index) || !out_is_void, + "grb::set (ascend, vector <- vector, masked): " + "use_index descriptor cannot be set if output vector is void" ); + + // catch contract violations + const size_t size = grb::size( y ); + if( size != grb::size( x ) ) { + return MISMATCH; + } + if( size == 0 ) { + return SUCCESS; + } + if( getID( x ) == getID( y ) ) { + return ILLEGAL; + } + + // delegate if possible + if( grb::size( mask ) == 0 ) { + return set( x, y ); + } + + // additional contract check + if( size != grb::size( mask ) ) { + return MISMATCH; + } + + // on resize + if( phase == RESIZE ) { + return SUCCESS; + } + + // on execute + assert( phase == EXECUTE ); + + RC ret = SUCCESS; + + constexpr const bool dense_descr = descr & descriptors::dense; + constexpr const bool dense_mask = dense_descr && + (descr & descriptors::structural) && + !(descr & descriptors::invert_mask); + + internal::Pipeline::stage_type func = [&x, &mask, &y] ( + internal::Pipeline &pipeline, + size_t lower_bound, size_t upper_bound + ) { +#ifdef _ASCEND_DEBUG + #pragma omp critical + std::cout << "\t\tExecution of stage set(x, mask, y) in the range(" + << lower_bound << ", " << upper_bound << ")" << std::endl; +#endif + RC rc = SUCCESS; + + Coords local_mask, local_x, local_y; + const size_t local_n = upper_bound - lower_bound; + size_t local_mask_nz = local_n; + size_t local_x_nz = local_n; + size_t local_y_nz = local_n; + +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + const bool already_dense_vectors = dense_descr || + 
pipeline.allAlreadyDenseVectors(); +#else + constexpr const bool already_dense_vectors = dense_descr; +#endif + + bool already_dense_mask = true; + bool already_dense_input_y = true; + + // make the vector empty unless the dense descriptor is provided + constexpr const bool mask_is_dense = (descr & descriptors::structural) && + !(descr & descriptors::invert_mask) && already_dense_vectors; + + if( !mask_is_dense ) { + local_x = internal::getCoordinates( x ).asyncSubset( lower_bound, + upper_bound ); + local_x_nz = local_x.nonzeroes(); + if( dense_descr && local_x_nz < local_n ) { + return ILLEGAL; + } + } + + if( !already_dense_vectors ) { +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + already_dense_mask = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( mask ) ); + if( !already_dense_mask ) { +#else + already_dense_mask = false; +#endif + local_mask = internal::getCoordinates( mask ).asyncSubset( lower_bound, + upper_bound ); + local_mask_nz = local_mask.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } + + already_dense_input_y = pipeline.containsAlreadyDenseVector( + &internal::getCoordinates( y ) ); + if( !already_dense_input_y ) { +#else + already_dense_input_y = false; +#endif + local_y = internal::getCoordinates( y ).asyncSubset( lower_bound, + upper_bound ); + local_y_nz = local_y.nonzeroes(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + } +#endif + } + + if( !mask_is_dense ) { + local_x.local_clear(); + if( lower_bound == 0 ) { + internal::getCoordinates( x ).reset_global_nnz_counter(); +#ifdef GRB_ALREADY_DENSE_OPTIMIZATION + pipeline.markMaybeSparseVector( &internal::getCoordinates( x ) ); +#endif + if( dense_descr ) { + pipeline.markMaybeSparseDenseDescriptorVerification( + &internal::getCoordinates( x ) ); + } + } + } + + // choose optimal loop size + const bool loop_over_y = (descr & descriptors::invert_mask) || + ( local_y_nz < local_mask_nz ); + +#ifdef GRB_BOOLEAN_DISPATCHER + rc = internal::boolean_dispatcher_masked_set< +#else + rc = internal::masked_set< +#endif + descr, out_is_void, in_is_void + >( + loop_over_y, + already_dense_input_y, already_dense_mask, mask_is_dense, + lower_bound, upper_bound, + local_x, local_mask, local_y, + x, mask, y + ); + + if( !mask_is_dense ) { + internal::getCoordinates( x ).asyncJoinSubset( local_x, lower_bound, + upper_bound ); + } + + return rc; + }; + + ret = ret ? 
ret : internal::le.addStage( + std::move( func ), + internal::Opcode::IO_SET_MASKED_VECTOR, + size, sizeof( OutputType ), dense_descr, dense_mask, + &x, nullptr, &internal::getCoordinates( x ), nullptr, + &mask, &y, nullptr, nullptr, + &internal::getCoordinates( mask ), &internal::getCoordinates( y ), + nullptr, nullptr, + nullptr + ); + +#ifdef _ASCEND_DEBUG + std::cout << "\t\tStage added to a pipeline: set(x, mask, y)" << std::endl; +#endif + return ret; + } + + namespace internal { + + template< + bool A_is_mask, + Descriptor descr, + typename OutputType, + typename InputType1, typename InputType2 = const OutputType, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2 + > + RC set( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const InputType2 * __restrict__ id = nullptr + ) noexcept { + // ascend execution is not supported + // first, execute any computation that is not completed + grb::internal::le.execution(); + + // second, delegate to the reference backend + return set< A_is_mask, descr, OutputType, InputType1, InputType2 >( + internal::getRefMatrix( C ), internal::getRefMatrix( A ), id ); + } + + } // end namespace internal::grb + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, typename InputType, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2 + > + RC set( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType, ascend, RIT2, CIT2, NIT2 > &A, + const Phase &phase = EXECUTE + ) noexcept { + static_assert( std::is_same< OutputType, void >::value || + !std::is_same< InputType, void >::value, + "grb::set cannot interpret an input pattern matrix without a " + "semiring or a monoid. This interpretation is needed for " + "writing the non-pattern matrix output. Possible solutions: 1) " + "use a (monoid-based) foldl / foldr, 2) use a masked set, or " + "3) change the output of grb::set to a pattern matrix also." 
); +#ifdef _DEBUG + std::cout << "Called grb::set (matrix-to-matrix, ascend)" << std::endl; +#endif + // static checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType, OutputType >::value + ), "grb::set", + "called with non-matching value types" ); + + // dynamic checks + assert( phase != TRY ); + + // delegate + if( phase == RESIZE ) { + return resize( C, nnz( A ) ); + } else { + assert( phase == EXECUTE ); + return internal::set< false, descr >( C, A ); + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename OutputType, typename InputType1, typename InputType2, + typename RIT1, typename CIT1, typename NIT1, + typename RIT2, typename CIT2, typename NIT2 + > + RC set( + Matrix< OutputType, ascend, RIT1, CIT1, NIT1 > &C, + const Matrix< InputType1, ascend, RIT2, CIT2, NIT2 > &A, + const InputType2 &val, + const Phase &phase = EXECUTE + ) noexcept { + static_assert( !std::is_same< OutputType, void >::value, + "internal::grb::set (masked set to value): cannot have a pattern " + "matrix as output" ); +#ifdef _DEBUG + std::cout << "Called grb::set (matrix-to-value-masked, ascend)\n"; +#endif + // static checks + NO_CAST_ASSERT( ( !(descr & descriptors::no_casting) || + std::is_same< InputType2, OutputType >::value + ), "grb::set", + "called with non-matching value types" + ); + + // dynamic checks + assert( phase != TRY ); + + // delegate + if( phase == RESIZE ) { + return resize( C, nnz( A ) ); + } else { + assert( phase == EXECUTE ); + if( std::is_same< OutputType, void >::value ) { + return internal::set< false, descr >( C, A ); + } else { + return internal::set< true, descr >( C, A, &val ); + } + } + } + + template< + Descriptor descr = descriptors::no_operation, + typename InputType, + typename fwd_iterator, + typename Coords, + class Dup = operators::right_assign< InputType > + > + RC buildVector( + Vector< InputType, ascend, Coords > &x, + fwd_iterator start, + const fwd_iterator end, + const IOMode mode, + const Dup &dup = Dup() + ) { + return buildVector< descr, InputType, fwd_iterator, Coords, Dup >( + internal::getRefVector( x ), start, end, mode, dup ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename InputType, + typename fwd_iterator1, + typename fwd_iterator2, + typename Coords, + class Dup = operators::right_assign< InputType > + > + RC buildVector( + Vector< InputType, ascend, Coords > &x, + fwd_iterator1 ind_start, + const fwd_iterator1 ind_end, + fwd_iterator2 val_start, + const fwd_iterator2 val_end, + const IOMode mode, + const Dup &dup = Dup() + ) { + internal::le.execution( &x ); + return buildVector< + descr, InputType, fwd_iterator1, fwd_iterator2, Coords, Dup + >( + internal::getRefVector( x ), ind_start, ind_end, val_start, val_end, + mode, dup + ); + } + + /** buildMatrixUnique is based on that of the reference backend */ + template< + Descriptor descr = descriptors::no_operation, + typename InputType, + typename RIT, + typename CIT, + typename NIT, + typename fwd_iterator + > + RC buildMatrixUnique( + Matrix< InputType, ascend, RIT, CIT, NIT > &A, + fwd_iterator start, + const fwd_iterator end, + const IOMode mode + ) { + return buildMatrixUnique< + descr, InputType, RIT, CIT, NIT, fwd_iterator + >( internal::getRefMatrix(A), start, end, mode ); + } + + template< + typename InputType, + typename Coords + > + uintptr_t getID( const Vector< InputType, ascend, Coords > &x ) { + return getID( internal::getRefVector( x ) ); + } + + template<> + RC wait< ascend >(); + + /** \internal 
Dispatch to base wait implementation */ + template< + typename InputType, + typename Coords, + typename ... Args + > + RC wait( + const Vector< InputType, ascend, Coords > &x, + const Args &... args + ) { + RC ret = internal::le.execution( &x ); + if( ret != SUCCESS ) { + return ret; + } + return wait( args... ); + } + + template< + typename InputType, + typename Coords + > + RC wait( const Vector< InputType, ascend, Coords > &x ) { + return internal::le.execution( &x ); + } + + /** \internal Dispatch to base wait implementation */ + template< + typename InputType, + typename RIT, typename CIT, typename NIT, + typename... Args + > + RC wait( + const Matrix< InputType, ascend, RIT, CIT, NIT > &A, + const Args &... args + ) { + (void) A; + //TODO: currently, matrices are read only and no action is required + // once the level-3 primitives are implemented + // the pipeline should be executed like for vectors + return wait( args... ); + } + + template< typename InputType, typename RIT, typename CIT, typename NIT > + RC wait( const Matrix< InputType, ascend > &A ) { + (void) A; + //TODO: currently, matrices are read only and no action is required + // once the level-3 primitives are implemented + // the pipeline should be executed like for vectors + //return wait( args... ); + assert( false ); + return UNSUPPORTED; + } + + /** @} */ + +} // namespace grb + +#undef NO_CAST_ASSERT + +#endif // end ``_H_GRB_ASCEND_IO + diff --git a/include/graphblas/ascend/lazy_evaluation.hpp b/include/graphblas/ascend/lazy_evaluation.hpp new file mode 100644 index 000000000..683129b16 --- /dev/null +++ b/include/graphblas/ascend/lazy_evaluation.hpp @@ -0,0 +1,80 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _H_ALP_ASCEND_LAZY_EVALUATION +#define _H_ALP_ASCEND_LAZY_EVALUATION + +/** + * To enable debugging information only for the ascend backend, the code + * should be combiled with the _ASCEND_DEBUG definition, without defining + * _DEBUG. If the code is compiled with _DEBUG, the debugging information for + * the ascend backend is enabled as well. + */ +#if !defined(_ASCEND_DEBUG) && defined(_DEBUG) + #define _ASCEND_DEBUG +#endif + +#include + +#include +#include +#include + +namespace alp { + + namespace internal { + + class Stage; + + /** + * Encodes a single pipeline that may be expanded, merged, or executed. 
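+		 *
+		 * More precisely, this class maintains the collection of pipelines built
+		 * up during lazy evaluation (see the pipelines vector below); addPipeline()
+		 * opens a new pipeline, while the addStage() overloads presumably append
+		 * stages to the most recently opened one.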
+ */ + class AscendLazyEvaluation { + + private: + + size_t num_pipelines; + std::vector< alp::internal::AscendPipeline > pipelines; + + + public: + + AscendLazyEvaluation(); + void addPipeline(); + void insertFreeInputTensorStages( const std::vector< int > &forEachAxes ); + const Tensor &store( const Tensor &output_tensor ); + void clear(); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const double alpha, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const Tensor &tensor3, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); +// void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const Tensor &tensor3, const Tensor &tensor4, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void generateDeclarations( std::stringstream &declarations ); +// void generateConstructor( std::stringstream &constructor ); + void generateHostBody( std::stringstream &os, std::stringstream &analyticModelArgs, std::stringstream &analyticModelFormalParams, std::stringstream &analyticModelDecls, std::stringstream &analyticModelConstrBody ); + void generateInit( std::stringstream &init ); + void generateProcess( std::stringstream &process, std::stringstream &processCall ); + + void debug_print() const; + }; + + } + +} + +#endif //end `_H_ALP_ASCEND_LAZY_EVALUATION' + diff --git a/include/graphblas/ascend/matrix.hpp b/include/graphblas/ascend/matrix.hpp new file mode 100644 index 000000000..d224ce91b --- /dev/null +++ b/include/graphblas/ascend/matrix.hpp @@ -0,0 +1,602 @@ +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the Ascend matrix container. + * + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_MATRIX +#define _H_GRB_ASCEND_MATRIX + +#include //std::stringstream +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "forward.hpp" + + +namespace grb { + + namespace internal { + + template< typename DataType, typename RIT, typename CIT, typename NIT > + Matrix< DataType, reference, RIT, CIT, NIT >& getRefMatrix( + Matrix< DataType, ascend, RIT, CIT, NIT > &A ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + const Matrix< DataType, reference, RIT, CIT, NIT >& getRefMatrix( + const Matrix< DataType, ascend, RIT, CIT, NIT > &A ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + const size_t & getNonzeroCapacity( + const grb::Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return A.cap; + } + + template< typename D, typename RIT, typename CIT, typename NIT > + const size_t & getCurrentNonzeroes( + const grb::Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return A.nz; + } + + template< typename D, typename RIT, typename CIT, typename NIT > + void setCurrentNonzeroes( + grb::Matrix< D, ascend, RIT, CIT, NIT > &A, + const size_t nnz + ) noexcept { + A.nz = nnz; + } + + /** + * \internal + * + * Retrieves internal SPA buffers. + * + * @param[out] coorArr Pointer to the bitmask array + * @param[out] coorBuf Pointer to the stack + * @param[out] valBuf Pointer to the value buffer + * @param[in] k If 0, the row-wise SPA is returned + * If 1, the column-wise SPA is returned + * Any other value is not allowed + * @param[in] A The matrix of which to return the associated SPA + * data structures. + * + * @tparam InputType The type of the value buffer. 
+ * + * \endinternal + */ + template< typename InputType, typename RIT, typename CIT, typename NIT > + void getMatrixBuffers( + char * &coorArr, char * &coorBuf, InputType * &valbuf, + const unsigned int k, + const grb::Matrix< InputType, ascend, RIT, CIT, NIT > &A + ) noexcept { + assert( k < 2 ); + coorArr = const_cast< char * >( A.coorArr[ k ] ); + coorBuf = const_cast< char * >( A.coorBuf[ k ] ); + valbuf = const_cast< InputType * >( A.valbuf[ k ] ); + } + + template< Descriptor descr, + bool input_dense, bool output_dense, + bool masked, + bool left_handed, + template< typename > class One, + typename IOType, + class AdditiveMonoid, class Multiplication, + typename InputType1, typename InputType2, typename InputType3, + typename RowColType, typename NonzeroType, + typename Coords + > + void vxm_inner_kernel_scatter( + RC &rc, + Vector< IOType, ascend, Coords > &destination_vector, + IOType * __restrict__ const &destination, + const size_t &destination_range, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_index, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &dst_global_to_local + ); + + template< + Descriptor descr, + bool masked, bool input_masked, bool left_handed, + template< typename > class One, + class AdditiveMonoid, class Multiplication, + typename IOType, typename InputType1, typename InputType2, + typename InputType3, typename InputType4, + typename Coords, typename RIT, typename CIT, typename NIT + > + RC vxm_generic( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &row_l2g, + const std::function< size_t( size_t ) > &row_g2l, + const std::function< size_t( size_t ) > &col_l2g, + const std::function< size_t( size_t ) > &col_g2l + ); + + } // namespace internal + + template< typename DataType, typename RIT, typename CIT, typename NIT > + size_t nrows( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + size_t ncols( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + size_t nnz( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + RC clear( Matrix< InputType, ascend, RIT, CIT, NIT > & ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + RC resize( + Matrix< DataType, ascend, RIT, CIT, NIT > &, + const size_t + ) noexcept; + + template< + class ActiveDistribution, typename Func, typename DataType, + typename RIT, typename CIT, typename NIT + > + RC eWiseLambda( + const Func f, + const Matrix< DataType, ascend, RIT, CIT, NIT > &A, + const size_t s, const size_t P + ); + + /** + * A GraphBLAS matrix, ascend implementation. 
+ * + * Uses Compressed Column Storage (CCS) plus Compressed Row Storage (CRS). + * + * \warning This implementation prefers speed over memory efficiency. + * + * @tparam D The type of a nonzero element. + * + * \internal + * @tparam RowIndexType The type used for row indices + * @tparam ColIndexType The type used for column indices + * @tparam NonzeroIndexType The type used for nonzero indices + * \endinternal + */ + template< + typename D, + typename RowIndexType, + typename ColIndexType, + typename NonzeroIndexType + > + class Matrix< D, ascend, RowIndexType, ColIndexType, NonzeroIndexType > { + + static_assert( !grb::is_object< D >::value, + "Cannot create an ALP matrix of ALP objects!" ); + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend Matrix< DataType, reference, RIT, CIT, NIT > & internal::getRefMatrix( + Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend const Matrix< DataType, reference, RIT, CIT, NIT > & + internal::getRefMatrix( + const Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept; + + + /* ********************* + BLAS2 friends + ********************* */ + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend size_t nrows( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend size_t ncols( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend size_t nnz( + const Matrix< DataType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend RC clear( + Matrix< InputType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename DataType, typename RIT, typename CIT, typename NIT > + friend RC resize( + Matrix< DataType, ascend, RIT, CIT, NIT > &, + const size_t + ) noexcept; + + template< + class ActiveDistribution, typename Func, typename DataType, + typename RIT, typename CIT, typename NIT + > + friend RC eWiseLambda( + const Func, + const Matrix< DataType, ascend, RIT, CIT, NIT > &, + const size_t, const size_t + ); + + template< + Descriptor descr, + bool input_dense, bool output_dense, bool masked, bool left_handed, + template< typename > class One, + typename IOType, + class AdditiveMonoid, class Multiplication, + typename InputType1, typename InputType2, + typename InputType3, + typename RowColType, typename NonzeroType, + typename Coords + > + friend void internal::vxm_inner_kernel_scatter( + RC &rc, + Vector< IOType, ascend, Coords > &destination_vector, + IOType * __restrict__ const &destination, + const size_t &destination_range, + const Vector< InputType1, ascend, Coords > &source_vector, + const InputType1 * __restrict__ const &source, + const size_t &source_index, + const internal::Compressed_Storage< + InputType2, RowColType, NonzeroType + > &matrix, + const Vector< InputType3, ascend, Coords > &mask_vector, + const InputType3 * __restrict__ const &mask, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &src_local_to_global, + const std::function< size_t( size_t ) > &dst_global_to_local + ); + + template< + Descriptor descr, + bool masked, bool input_masked, bool left_handed, + template< typename > class One, + class AdditiveMonoid, class Multiplication, + typename IOType, typename 
InputType1, typename InputType2, + typename InputType3, typename InputType4, + typename Coords, typename RIT, typename CIT, typename NIT + > + friend RC internal::vxm_generic( + Vector< IOType, ascend, Coords > &u, + const Vector< InputType3, ascend, Coords > &mask, + const Vector< InputType1, ascend, Coords > &v, + const Vector< InputType4, ascend, Coords > &v_mask, + const Matrix< InputType2, ascend, RIT, CIT, NIT > &A, + const AdditiveMonoid &add, + const Multiplication &mul, + const std::function< size_t( size_t ) > &row_l2g, + const std::function< size_t( size_t ) > &row_g2l, + const std::function< size_t( size_t ) > &col_l2g, + const std::function< size_t( size_t ) > &col_g2l + ); + + /* ******************** + IO friends + ******************** */ + + template< + Descriptor descr, typename InputType, + typename RIT, typename CIT, typename NIT, + typename fwd_iterator + > + friend RC buildMatrixUnique( + Matrix< InputType, ascend, RIT, CIT, NIT > &, + fwd_iterator, const fwd_iterator, + const IOMode + ); + + friend internal::Compressed_Storage< D, RowIndexType, NonzeroIndexType > & + internal::getCRS<>( + Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > &A + ) noexcept; + + friend const internal::Compressed_Storage< + D, + RowIndexType, NonzeroIndexType + > & internal::getCRS<>( + const Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > &A + ) noexcept; + + friend internal::Compressed_Storage< D, ColIndexType, NonzeroIndexType > & + internal::getCCS<>( + Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > &A + ) noexcept; + + friend const internal::Compressed_Storage< + D, ColIndexType, NonzeroIndexType + > & internal::getCCS<>( + const Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > &A + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend const size_t & internal::getNonzeroCapacity( + const grb::Matrix< InputType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend const size_t & internal::getCurrentNonzeroes( + const grb::Matrix< InputType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend void internal::setCurrentNonzeroes( + grb::Matrix< InputType, ascend, RIT, CIT, NIT > &, const size_t + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend void internal::getMatrixBuffers( + char *&, char *&, InputType *&, + const unsigned int, + const grb::Matrix< InputType, ascend, RIT, CIT, NIT > & + ) noexcept; + + template< typename InputType, typename RIT, typename CIT, typename NIT > + friend uintptr_t getID( + const Matrix< InputType, ascend, RIT, CIT, NIT > & + ); + + + private: + + Matrix< D, reference, RowIndexType, ColIndexType, NonzeroIndexType > ref; + + /** Our own type. 
*/ + typedef Matrix< + D, ascend, + RowIndexType, ColIndexType, NonzeroIndexType + > self_type; + + Matrix() : ref( ) + {} + + Matrix( + const D *__restrict__ const _values, + const ColIndexType *__restrict__ const _column_indices, + const NonzeroIndexType *__restrict__ const _offset_array, + const size_t _m, const size_t _n, + const size_t _cap, + char *__restrict__ const buf1 = nullptr, + char *__restrict__ const buf2 = nullptr, + D *__restrict__ const buf3 = nullptr + ) : ref( + _values, _column_indices, _offset_array, + _m, _n, _cap, + buf1, buf2, buf3 + ) {} + + void moveFromOther( self_type &&other ) { + ref.moveFromOther( std::move( other.ref ) ); + } + + RC clear() { + return ref.clear(); + } + + RC resize( const size_t nonzeroes ) { + return ref.resize( nonzeroes ); + } + + template< + Descriptor descr = descriptors::no_operation, + typename fwd_iterator + > + RC buildMatrixUnique( + const fwd_iterator &_start, + const fwd_iterator &_end + ) { + + return ref.buildMatrixUnique( _start, _end ); + } + + + public: + + /** @see Matrix::value_type */ + typedef D value_type; + + /** The iterator type over matrices of this type. */ + typedef typename internal::Compressed_Storage< + D, RowIndexType, NonzeroIndexType + >::template ConstIterator< + internal::Distribution< reference > + > const_iterator; + + Matrix( + const size_t rows, const size_t columns, const size_t nz + ) : ref( rows, columns, nz ) + {} + + Matrix( const size_t rows, const size_t columns ) : ref( rows, columns ) + {} + + /** + * \internal + * \todo See below code comment + * \endinternal + */ + Matrix( + const Matrix< + D, ascend, RowIndexType, ColIndexType, NonzeroIndexType + > &other ) : ref( other.ref ) + { + //TODO: the pipeline should be executed once level-3 primitives are + // implemented. In the current implementation matrices may be used only + // as the input of SpMV + } + + Matrix( self_type &&other ) noexcept : ref( std::move( other.ref ) ) { + //TODO: the pipeline should be executed once level-3 primitives are + // implemented. 
In the current implementation matrices may be used only + // as the input of SpMV + } + + self_type& operator=( self_type &&other ) noexcept { + ref = std::move( other.ref ); + return *this; + } + + ~Matrix() { + // the pipeline is executed before memory deallocation + internal::le.execution( this ); + } + + template< class ActiveDistribution = internal::Distribution< reference > > + typename internal::Compressed_Storage< + D, RowIndexType, NonzeroIndexType + >::template ConstIterator< ActiveDistribution > begin( + const IOMode mode = PARALLEL, + const size_t s = 0, const size_t P = 1 + ) const { + return ref.begin( mode, s, P ); + } + + template< class ActiveDistribution = internal::Distribution< reference > > + typename internal::Compressed_Storage< + D, + RowIndexType, + NonzeroIndexType + >::template ConstIterator< ActiveDistribution > end( + const IOMode mode = PARALLEL, + const size_t s = 0, const size_t P = 1 + ) const { + return ref.end( mode, s, P ); + } + + template< class ActiveDistribution = internal::Distribution< reference > > + typename internal::Compressed_Storage< + D, + RowIndexType, + NonzeroIndexType + >::template ConstIterator< ActiveDistribution > cbegin( + const IOMode mode = PARALLEL + ) const { + return ref.cbegin( mode ); + } + + template< class ActiveDistribution = internal::Distribution< reference > > + typename internal::Compressed_Storage< + D, + RowIndexType, + NonzeroIndexType + >::template ConstIterator< ActiveDistribution > cend( + const IOMode mode = PARALLEL + ) const { + return ref.cend( mode ); + } + + }; + + // template specialisation for GraphBLAS type traits + template< typename D, typename RIT, typename CIT, typename NIT > + struct is_container< Matrix< D, ascend, RIT, CIT, NIT > > { + /** A ascend Matrix is a GraphBLAS object. 
*/ + static const constexpr bool value = true; + }; + + //internal getters implementation + namespace internal { + + template< typename DataType, typename RIT, typename CIT, typename NIT > + inline Matrix< DataType, reference, RIT, CIT, NIT >& getRefMatrix( + Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return (A.ref); + } + + template< typename DataType, typename RIT, typename CIT, typename NIT > + inline const Matrix< DataType, reference, RIT, CIT, NIT >& getRefMatrix( + const Matrix< DataType, ascend, RIT, CIT, NIT > &A + ) noexcept { + return (A.ref); + } + + } //end ``grb::internal'' namespace + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_MATRIX'' + diff --git a/include/graphblas/ascend/operators.hpp b/include/graphblas/ascend/operators.hpp new file mode 100644 index 000000000..1a495dba2 --- /dev/null +++ b/include/graphblas/ascend/operators.hpp @@ -0,0 +1,157 @@ +#ifndef _H_ALP_ASCEND_OPERATORS +#define _H_ALP_ASCEND_OPERATORS + +#include + +#include + +//#include "graphblas/ascend/grid.hpp" +#include "graphblas/ascend/utils.hpp" + + +namespace alp +{ + class Tensor; + + + Tensor getView( const Tensor &parent ); + + // TODO extend to multiple containers + void store( const Tensor &output ); + + void set( + Tensor &tout, + Tensor &tin, + const std::vector< int > &activeAxes = std::vector< int >() + ); + + void set( + Tensor &tout, + double alpha // TODO: this is hardcoded datatype + ); + + void apply( + Tensor &tout, + Tensor &tin, + const std::string &opName, + const std::vector< int > &activeAxes = std::vector< int >() + ); + + void apply( + Tensor &tout, + Tensor &tin1, + Tensor &tin2, + const std::string &opName, + const std::vector< int > &activeAxes = std::vector< int >() + ); + + void foldl( + Tensor &tinout, + Tensor &tin, + const std::string &opName, + const std::vector< int > &activeAxes = std::vector< int >() + ); + +// template< size_t sm, size_t pm > + void foldl( +// const Grid< sm, pm > &grid, + Tensor &tinout, + const std::string &opName, + const std::vector< int > &activeAxes = std::vector< int >() + ); + + + struct ReductionOperation { + Tensor &input; + const std::vector< int > axes; + const internal::Stagetype opType; + const std::string opName; + + ReductionOperation( + Tensor &input, + const std::vector< int > &axes, + const internal::Stagetype &op, + const std::string &opName + ) : + input( input ), + axes( axes ), + opType( op ), + opName( opName ) {} + + }; + + /** + * Max-reduce operator + */ + template< typename AxisType > + ReductionOperation max( Tensor &z, const AxisType axis ) { + static_assert( + std::is_convertible< AxisType, int >::value || std::is_convertible< AxisType, std::string >::value, + "AxisType must be convertible to int or std::string" + ); + const int axisId = getAxisId( axis ); + return { z, { axisId }, internal::Stagetype::FOLDL_MAX, "max" }; + } + + /** + * Add-reduce operator + */ + template< typename AxisType > + ReductionOperation add( Tensor &z, const AxisType axis ) { + static_assert( + std::is_convertible< AxisType, int >::value || std::is_convertible< AxisType, std::string >::value, + "AxisType must be convertible to int or std::string" + ); + const int axisId = getAxisId( axis ); + return { z, { axisId }, internal::Stagetype::FOLDL_ADD, "add" }; + } + + struct ApplyOperation { + Tensor& input1; + Tensor& input2; + const std::vector< int > axes; + const std::string opName; + + ApplyOperation( + Tensor &input1, Tensor &input2, + const std::vector< int > &axes, + const std::string &opName + ) : + input1( 
input1 ), + input2( input2 ), + axes( axes ), + opName( opName ) { } + }; + + /** + * Add-reduce operator + */ + template< typename AxisType > + ApplyOperation add( Tensor &y, Tensor &z, const AxisType axis ) { + static_assert( + std::is_convertible< AxisType, int >::value || std::is_convertible< AxisType, std::string >::value, + "AxisType must be convertible to int or std::string" + ); + const int axisId = getAxisId( axis ); + // std::vector inputs = { y, z }; + return { y, z, { axisId }, "add" }; + } + + + + /** + * Minus operator + */ + template< typename AxisType > + ApplyOperation minus( Tensor &y, Tensor &z, const AxisType axis ) { + static_assert( + std::is_convertible< AxisType, int >::value || std::is_convertible< AxisType, std::string >::value, + "AxisType must be convertible to int or std::string" + ); + const int axisId = getAxisId( axis ); + return { y, z, { axisId }, "minus" }; + } + +} // namespace alp + +#endif diff --git a/include/graphblas/ascend/opgen.hpp b/include/graphblas/ascend/opgen.hpp new file mode 100644 index 000000000..36950cdaa --- /dev/null +++ b/include/graphblas/ascend/opgen.hpp @@ -0,0 +1,100 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Class that defines the state of the code generation. + * + * @author A. N. Yzelman. + * @date 12th of September, 2023. + */ + + +#ifndef _H_ALP_ASCEND_OPGEN +#define _H_ALP_ASCEND_OPGEN + +#include +#include +#include +#include +#include + +#include + +#include + +namespace alp { + + namespace internal { + + class OpGen { + + public: + + OpGen() = default; + virtual ~OpGen() = default; + + /** Returns a string representation of a given type. */ +/* template + static std::string type_name(){ + int info = 0; + return abi::__cxa_demangle( typeid(T).name(), NULL, NULL, &info ); + } +*/ + /** + * Maintains a mapping from chunks to their sizes. + * + * \warning The map does not guarantee that chunks who have since been + * destructed will no longer appear in the map. + */ +//TODO how is this supposed to be used? 
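As an aside on the reduction and apply builders declared in operators.hpp above: they are designed to be consumed by Tensor::operator=. The following sketch (not part of this change set) shows how they are intended to compose; the axis labels, the op-name string "exp", the kernel name, and the include paths are assumptions for illustration only, not confirmed API.

#include "graphblas/ascend/tensor.hpp"    // assumed include path
#include "graphblas/ascend/operators.hpp" // assumed include path

// A hypothetical kernel body: reduce over axis "j", subtract the reduction,
// exponentiate in place, and mark the result as a pipeline output.
void example_kernel_sketch() {
	alp::Tensor in(  alp::Datatype::FP32, alp::make_axes( "i", "j" ) );
	alp::Tensor out( alp::Datatype::FP32, alp::make_axes( "i", "j" ) );
	alp::Tensor red( alp::Datatype::FP32, alp::make_axes( "i" ) );

	// ReductionOperation: fold axis "j" of in into red via max
	red( "i" ) = alp::max( in, "j" );

	// ApplyOperation: element-wise minus, broadcasting red along axis "j"
	out( "i", "j" ) = alp::minus( in, red, "j" );

	// the free-function primitives remain available directly
	alp::foldl( out, "exp" ); // the op-name string is an assumption
	alp::store( out );        // marks out as an output of the lazy pipeline
}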
+// static std::map< std::string, std::string > chunkSize; + + static std::string kernel_id; + + /** Indicates if the executed code is within the lambda function of a forEach */ + static size_t forEachLevel; + + static std::vector< std::vector< int > > forEachAxes; + static std::vector< int > lastAxes; + + static std::stringstream aux_func; + static std::stringstream analyticModelFormalParams; + static std::stringstream hostFormalParam; + static std::stringstream hostBody; + static std::stringstream hostArg; + static std::stringstream constrBody; + static std::stringstream classMembers; + static std::stringstream initBody; + static std::stringstream genericProcessBody; + static std::stringstream declarations; + + static std::vector< std::stringstream > processFunc; + static std::vector< std::stringstream > computeFunc; + static std::vector< std::stringstream > copyinFunc; + static std::vector< std::stringstream > copyoutFunc; + + static void compileClear(); + static void generate( std::ostream &os ); + }; + } + +} + +#endif // end _H_ALP_ASCEND_OPGEN + diff --git a/include/graphblas/ascend/pinnedvector.hpp b/include/graphblas/ascend/pinnedvector.hpp new file mode 100644 index 000000000..3991e8824 --- /dev/null +++ b/include/graphblas/ascend/pinnedvector.hpp @@ -0,0 +1,164 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * PinnedVector implementation of the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_PINNEDVECTOR +#define _H_GRB_ASCEND_PINNEDVECTOR + +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + + /** + * The PinnedVector class is based on that of the reference backend. + * + * \internal There is some code duplication with the reference PinnedVector. + * At present, it is unclear if this can be reduced. + */ + template< typename IOType > + class PinnedVector< IOType, ascend > { + + private: + + /** Essentially a shared pointer into the nonzero values */ + utils::AutoDeleter< IOType > _raw_deleter; + + /** Essentially a shared pointer into the SPA's stack. */ + utils::AutoDeleter< char > _stack_deleter; + + /** The shared nonzero values */ + IOType * _buffered_values; + + /** + * The shared coordinates, on which only stack-based accesses are performed. + */ + internal::Coordinates< + config::IMPLEMENTATION< ascend >::coordinatesBackend() + > _buffered_coordinates; + + + public: + + /** Constructs an empty pinned vector. */ + PinnedVector() : _buffered_values( nullptr ) {} + + /** Constructs a pinning of \a x */ + PinnedVector( + const Vector< IOType, ascend, internal::Coordinates< + config::IMPLEMENTATION< ascend >::coordinatesBackend() + > > &x, + const IOMode mode + ) { + // The execution of a pipeline that uses the vector is necessary. 
+ if( internal::getCoordinates(x).size() > 0 ) { + internal::le.execution( &x ); + } + + _raw_deleter = internal::getRefVector(x)._raw_deleter; + _stack_deleter = internal::getRefVector(x)._buffer_deleter; + _buffered_values = internal::getRefVector(x)._raw; + _buffered_coordinates = internal::getRefVector(x)._coordinates; + + // The ascend backend is always single process, so the mode is unused. + (void) mode; + } + + /** \internal No implementation details */ + inline size_t size() const noexcept { +#ifndef NDEBUG + if( _buffered_coordinates.size() == 0 ) { + assert( _buffered_values == nullptr ); + } +#endif + return _buffered_coordinates.size(); + } + + /** \internal No implementation details */ + inline size_t nonzeroes() const noexcept { +#ifndef NDEBUG + if( _buffered_coordinates.size() == 0 ) { + assert( _buffered_values == nullptr ); + } +#endif + return _buffered_coordinates.nonzeroes(); + } + + /** \internal No implementation details */ + template< typename OutputType = IOType > + inline OutputType getNonzeroValue( + const size_t k, + const OutputType one + ) const noexcept { + assert( k < nonzeroes() ); + assert( _buffered_coordinates.size() > 0 ); + if( _buffered_values == nullptr ) { + return one; + } else { + const size_t index = getNonzeroIndex( k ); + return static_cast< OutputType >( + _buffered_values[ index ] + ); + } + } + + /** \internal No implementation details */ + inline IOType getNonzeroValue( + const size_t k + ) const noexcept { + assert( k < nonzeroes() ); + assert( _buffered_coordinates.size() > 0 ); + assert( _buffered_values != nullptr ); + const size_t index = getNonzeroIndex( k ); + assert( index < _buffered_coordinates.size() ); + return _buffered_values[ index ]; + } + + /** \internal No implementation details */ + inline size_t getNonzeroIndex( + const size_t k + ) const noexcept { + assert( k < nonzeroes() ); + return _buffered_coordinates.index( k ); + } + + }; + +} // namespace grb + +#endif // end ``_H_GRB_ASCEND_PINNEDVECTOR'' + diff --git a/include/graphblas/ascend/pipeline.hpp b/include/graphblas/ascend/pipeline.hpp new file mode 100644 index 000000000..b6286ca58 --- /dev/null +++ b/include/graphblas/ascend/pipeline.hpp @@ -0,0 +1,102 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Describes an Ascend pipeline. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_ALP_ASCEND_PIPELINE +#define _H_ALP_ASCEND_PIPELINE + +/** + * To enable debugging information only for the ascend backend, the code + * should be combiled with the _ASCEND_DEBUG definition, without defining + * _DEBUG. If the code is compiled with _DEBUG, the debugging information for + * the ascend backend is enabled as well. 
+ */ +#if !defined(_ASCEND_DEBUG) && defined(_DEBUG) + #define _ASCEND_DEBUG +#endif + +#include +#include +#include +#include + +#include +#include +#include + +namespace alp { + + namespace internal { + + class Stage; + + /** + * Encodes a single pipeline that may be expanded, merged, or executed. + */ + class AscendPipeline { + + private: + + const size_t id; + std::vector< alp::internal::Stage > stages; + + // pointers to Tensors do not work because any local declaration + // inside the forEach will be invalid the moment the code is generated + std::unordered_set< Tensor > accessed; + std::unordered_set< Tensor > outputs; + + void insertTensorToInputs( const Tensor &tensor ); + std::set< int > getIteratedAxes() const; + + + public: + + AscendPipeline( size_t _id ); + AscendPipeline( size_t _id, const std::vector< int > &_forEachParallelAxes ); + void insertFreeInputTensorStages( const std::vector< int > &forEachAxes ); + const Tensor &store( const Tensor &output_tensor ); + bool isOutput( const Tensor &tensor ) const; + void clear(); + size_t getID() const; + std::string getTilingAxes() const; + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const double alpha, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const Tensor &tensor3, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); +// void addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const Tensor &tensor1, const Tensor &tensor2, const Tensor &tensor3, const Tensor &tensor4, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ); + void generateDeclarations( std::stringstream &declarations ); +// void generateConstructor( std::stringstream &constructor ); + void generateHostBody( std::stringstream &os, std::stringstream &analyticModelArgs, std::stringstream &analyticModelFormalParams, std::stringstream &analyticModelDecls, std::stringstream &analyticModelConstrBody ); + void generateInit( std::stringstream &init ); + void generateProcess( std::stringstream &process, std::stringstream &processCall ); + void debug_print() const; + }; + + } + +} + +#endif //end `_H_ALP_ASCEND_PIPELINE' + diff --git a/include/graphblas/ascend/properties.hpp b/include/graphblas/ascend/properties.hpp new file mode 100644 index 000000000..a9f009da4 --- /dev/null +++ b/include/graphblas/ascend/properties.hpp @@ -0,0 +1,58 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file + * + * Collects the Ascend backend properties. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_PROPERTIES +#define _H_GRB_ASCEND_PROPERTIES + +#include + + +namespace grb { + + /** No implementation notes. */ + template<> + class Properties< ascend > { + + public: + + /** + * This is a shared-memory parallel implementation and therefore captured + * scalars cannot be written to without causing data races. + */ + static constexpr const bool writableCaptured = false; + + /** This is a nonblocking backend. */ + static constexpr const bool isBlockingExecution = false; + + /** This is a nonblocking backend. */ + static constexpr const bool isNonblockingExecution = true; + + }; + +} // namespace grb + +#endif // end `_H_GRB_ASCEND_PROPERTIES + diff --git a/include/graphblas/ascend/semantics.hpp b/include/graphblas/ascend/semantics.hpp new file mode 100644 index 000000000..2f48438c1 --- /dev/null +++ b/include/graphblas/ascend/semantics.hpp @@ -0,0 +1,15 @@ +#ifndef _H_ALP_ASCEND_SEMANTICS +#define _H_ALP_ASCEND_SEMANTICS + +namespace alp { + + namespace internal { + + bool invalidForEachAxes( const std::vector< int > &axes ); + bool invalidAxes( const std::vector< int > &axes ); + + } // namespace internal + +} // namespace alp + +#endif // _H_ALP_ASCEND_SEMANTICS diff --git a/include/graphblas/ascend/spmd.hpp b/include/graphblas/ascend/spmd.hpp new file mode 100644 index 000000000..4f0453071 --- /dev/null +++ b/include/graphblas/ascend/spmd.hpp @@ -0,0 +1,68 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the SPMD functions for the Ascend backend. + * + * @author A. N. 
Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_SPMD +#define _H_GRB_ASCEND_SPMD + +#include //size_t + +#include + + +namespace grb { + + /** The spmd class is based on that of the reference backend */ + template<> + class spmd< ascend > { + + public: + + /** Refers back to the reference backend */ + static inline size_t nprocs() noexcept { + return spmd< reference >::nprocs(); + } + + /** Refers back to the reference backend */ + static inline size_t pid() noexcept { + return spmd< reference >::pid(); + } + + /** Refers back to the reference backend */ + static RC sync( const size_t msgs_in = 0, const size_t msgs_out = 0 ) noexcept { + return spmd< reference >::sync( msgs_in, msgs_out ); + } + + /** Refers back to the reference backend */ + static RC barrier() noexcept { + return spmd< reference >::barrier(); + } + + }; // end class ``spmd'' ascend implementation + +} // namespace grb + +#endif // end _H_GRB_ASCEND_SPMD + diff --git a/include/graphblas/ascend/stage.hpp b/include/graphblas/ascend/stage.hpp new file mode 100644 index 000000000..100f40e68 --- /dev/null +++ b/include/graphblas/ascend/stage.hpp @@ -0,0 +1,84 @@ +#ifndef _H_ALP_ASCEND_STAGE +#define _H_ALP_ASCEND_STAGE + +#include +#include + +#include +#include +#include + +namespace alp { + + namespace internal { + + class Stage { + + private: + + const AscendPipeline &pipeline; + const Stagetype enum_op_type; + const Rule rule; + const Tensor tensor0; + Tensor tensor1; + Tensor tensor2; + Tensor tensor3; + std::string tensor0_offset; + std::string tensor1_offset; + std::string tensor2_offset; +// std::string tensor3_offset; + std::string stride; + double alpha; //TODO double should be replaced by alp::Scalar + const std::vector< int > activeAxes; + const std::vector< int > forEachAxes; + + + public: + + Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, + const Tensor &_tensor0, const double _alpha, const std::vector< int > &_forEachAxes ); + Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, + const Tensor &_tensor0, + const std::vector< int > &_activeAxes, const std::vector< int > &_forEachAxes ); + Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, + const Tensor &_tensor0, const Tensor &_in_tensor, + const std::vector< int > &_activeAxes, const std::vector< int > &_forEachAxes ); + Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, + const Tensor &_tensor0, const Tensor &_tensor1, const Tensor &_tensor2, + const std::vector< int > &_activeAxes, const std::vector< int > &_forEachAxes ); +// Stage( const AscendPipeline &parent, Stagetype _enum_op_type, Rule _rule, +// const Tensor &_tensor0, const Tensor &_tensor1, const Tensor &_tensor2, const Tensor &_tensor3, +// const std::vector< int > &_activeAxes, const std::vector< int > &_forEachAxes ); + Stagetype getOpType() const; + Rule getRule() const; + const Tensor &getTensor0() const; + const std::vector< int > &getAxes() const; + const std::vector< int > &getForEachAxes() const; + std::string getOp( const std::string &tabs ) const; + std::string generateApplyMinusOp( const std::string &tabs ) const; + std::string generateApplyAddOp( const std::string &tabs ) const; + std::string generateFoldlDivideOp( const std::string &tabs ) const; + std::string generateFoldlMaxOp( const std::string &tabs ) const; + std::string generateFoldlTimesOp( const std::string &tabs ) const; + std::string generateFoldlAddOp( const std::string &tabs ) const; + std::string 
generateFoldlExpOp( const std::string &tabs ) const; + std::string generateSetTensorOp( const std::string &tabs ) const; + std::string generateSetScalarOp( const std::string &tabs ) const; + std::string generateGetViewOp( const std::string &tabs ) const; + std::string generateStoreOp( const std::string &tabs ) const; + std::string generateImplicitFreeOp( const std::string &tabs ) const; + std::string generateToDoOp( const std::string &tabs ) const; + + + private: + + std::vector< int > computeOperatorAxes() const; + void computeMemoryOffsets(); + void semanticsCheks(); + }; + } + +} + +#endif // end _H_ALP_ASCEND_STAGE + diff --git a/include/graphblas/ascend/symbolTable.hpp b/include/graphblas/ascend/symbolTable.hpp new file mode 100644 index 000000000..b9bc598a0 --- /dev/null +++ b/include/graphblas/ascend/symbolTable.hpp @@ -0,0 +1,109 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _H_GRB_ASCEND_SYMBOLTABLE +#define _H_GRB_ASCEND_SYMBOLTABLE + +#include +#include +#include +#include +#include + +#include +#include + +namespace alp { + + class Tensor; + + namespace internal { + + class SymbolTable { + + private: + + bool TBuf_decl; + + /** Maintains a counter for unique temporary scalar names. */ + size_t temp_scalar_id; + + // pointers to Tensors do not work because any local declaration + // inside the forEach will be invalid the moment the code is generated + + /** Maintains all the global declarations of the compiled function. */ + std::map< std::string, alp::Tensor > global_tensor_declarations; + + /** Maintains all the local declarations of the current forEach. */ + std::map< std::string, alp::Tensor > local_tensor_declarations; + + /** Maintains all the temporary declarations of the current forEach. */ + std::map< std::string, alp::Tensor > temp_tensor_declarations; + + /** Maintains all the buffers that are reused for local + * and temporary declarations of the current forEach. + */ + std::map< std::string, std::string > temp_local_buffer_declarations; + + /** Maintains the order of all the global tensors and only the output tensors, respectively */ + std::vector< alp::Tensor > all_global_tensors; + std::vector< alp::Tensor > outputs_global_tensors; + + /** + * Maintains a mapping from chunks to vectors. + * + * \warning The map does not guarantee that chunks who have since been + * destructed will no longer appear in the map. 
+ */ + std::map< std::string, std::string > viewToTensor; + + + public: + + SymbolTable(); + bool existsTBufTensorDecl() const; + void clearAll(); + + void addGlobalTensor( const alp::Tensor &t ); + void addLocalTensor( const alp::Tensor &t ); + void addTempTensor( const alp::Tensor &t ); + void addTensorView( const std::string &view_name, const std::string &parent_name ); +// std::string newTempScalar(); + void addOutputTensor( const alp::Tensor &t ); + void printHostLogFile( std::stringstream &listOfGlobalTensors ); + std::string getLocalTempTensorBuffer( Datatype type, const std::string &size = "" ); + void generateGlobalSymbols( std::stringstream &initFormalParam, + std::stringstream &customFormalParam, + std::stringstream &allAccessedArg, + std::stringstream &allTempLocalDecl ) const; + void generateTempLocalInit( std::stringstream &allTempLocalInit ) const; + const alp::Tensor &getTensorFromView( const alp::Tensor &tensor ) const; + + void debug_print() const; + + private: + + + void reuseLocalTempTensorBuffer( const alp::Tensor &t ); + }; + + } + +} + +#endif //end `_H_GRB_ASCEND_SYMBOLTABLE' + diff --git a/include/graphblas/ascend/tensor.hpp b/include/graphblas/ascend/tensor.hpp new file mode 100644 index 000000000..ab3e4d3a3 --- /dev/null +++ b/include/graphblas/ascend/tensor.hpp @@ -0,0 +1,130 @@ +#ifndef _H_ALP_ASCEND_TENSOR +#define _H_ALP_ASCEND_TENSOR + +#include +#include +#include +#include + +#include + +#include +#include + + +namespace alp { + + /** + * A global ALP/Ascend vector that resides in Ascend memory. + */ + + class Tensor { + + private: + + size_t id; + std::string name; + Datatype type; + internal::Scope scope; + std::vector< int > axes; + + Tensor& access( const std::vector &axes ) noexcept { + (void) axes; + return *this; + } + + + public: + + /** Maintains a counter for unique tensor names. */ + static size_t tensor_id; + + bool operator==( const Tensor &t ) const; + bool operator!=( const Tensor &t ) const { return not ( *this == t ); } + + /** + * @deprecated + * + * @brief Tensor[i] operator is deprecated. Use Tensor(i, ...) instead + */ + template< typename T, typename U > + T operator[]( const U axis ) = delete; + + /** + * @brief Replacement for Tensor[i] operator, allows to specify multiple + * axes in any order. + */ + template< typename AnyType > + Tensor& operator()( const AnyType &axis ) { + std::vector computedAxes{ getAxisId( axis ) }; + return access( computedAxes ); + } + + /** + * @brief Replacement for Tensor[i] operator, allows to specify multiple + * axes in any order. + */ + template< typename AnyType, typename... AnyPackType > + Tensor& operator()( const AnyType &axis, AnyPackType const... args ) { + std::vector computedAxes{ getAxisId( axis ) }; + for( auto arg : { args... 
} ) { + computedAxes.push_back( getAxisId( arg ) ); + } + return access( computedAxes ); + } + + /** + * @brief Assignment operator of a Tensor (deleted) + */ + void operator=( const Tensor& ) = delete; + + /** + * @brief Assignment operator of ReductionOperation + */ + void operator=( const ReductionOperation &op ); + + /** + * @brief Assignment operator of ApplyOperation + */ + void operator=( const ApplyOperation &op ); + + Tensor() = default; + Tensor( const Tensor &view_parent, const std::vector< int > &_axes ) noexcept; + Tensor( const Tensor &t ) noexcept; + Tensor( const std::vector< int > &_axes, const Datatype type ) noexcept; + + Tensor( + const Datatype type, + const std::vector< int > &axes_vector + ) noexcept; + + virtual ~Tensor() noexcept { } + + size_t getID() const; + const std::string &getName() const; + alp::Datatype getType() const; + internal::Scope getScope() const; + const std::vector< int > &getAxes() const; + bool isGlobalDecl() const; + bool isLocalDecl() const; + bool isTempDecl() const; + + std::string getAccessedElement( size_t id ) const; + std::string getAscendName( size_t id ) const; + std::string getAscendGlobalName( size_t id ) const; + std::string getTQueBufName( size_t id ) const; + }; +} + + +template<> +struct std::hash< alp::Tensor > +{ + std::size_t operator()( const alp::Tensor& tensor ) const noexcept + { + return std::hash< std::string >{}( tensor.getName() ); + + } +}; + +#endif diff --git a/include/graphblas/ascend/utils.hpp b/include/graphblas/ascend/utils.hpp new file mode 100644 index 000000000..81f24e1eb --- /dev/null +++ b/include/graphblas/ascend/utils.hpp @@ -0,0 +1,122 @@ +#ifndef _H_ALP_ASCEND_UTILS +#define _H_ALP_ASCEND_UTILS + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace alp { + + template< class T > + constexpr T Zero = T( 0 ); + + template< class T > + constexpr T Infinity = std::numeric_limits< T >::infinity(); + + // TODO: fix this so that mInfinity=-Infinity + template< class T > + constexpr T mInfinity = -Infinity< T >; + + enum class Datatype { FP16, FP32, VIEW_TYPE, NO_TYPE }; + + namespace internal { + + enum class Rule { + + NONE, + EWISE, + BCAST, + REDUCE + }; + + enum class Scope { + + GLOBAL, + LOCAL, + TEMP, + VIEW + }; + + enum class Stagetype { + + GET_VIEW, + STORE, + IMPLICIT_FREE, + SET_TENSOR, + SET_SCALAR, + APPLY_ADD, + APPLY_MINUS, + FOLDL_EXP, + FOLDL_DIVIDE, + FOLDL_MAX, + FOLDL_TIMES, + FOLDL_ADD + }; + + + std::string getDataType( const Datatype dtype ); + std::string getScope( const Scope scope ); + std::vector< int > vectorOfVectorsToVector( const std::vector< std::vector< int > > &vector_of_sets ); + std::vector< int > vectorDifference( const std::vector< int > &vector1, const std::vector< int > &vector2 ); + bool vectorSubset( const std::vector< int > &vector1, const std::vector< int > &vector2 ); + std::vector< int > vectorUnion( const std::vector< int > &vector1, const std::vector< int > &vector2 ); + + } // namespace internal + + + static std::atomic_int axes_counter{0}; + + + static inline int getAxisId( const std::string &axis ) { + static std::unordered_map associations; + + if (associations.find(axis) == associations.end()) { + associations[axis] = axes_counter++; + } + + return associations[axis]; + } + + template< typename IntegralType = int > + static inline int getAxisId( + const IntegralType axis, + typename std::enable_if< std::is_integral< IntegralType >::value, int >::type* = 0 + ) { + return static_cast< int >( axis ); + } + + 
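For reference, a brief sketch (not part of this change set) of how the axis helpers behave: a string label is mapped to a fresh integer identifier on first use and to the same identifier on every later use, an integral axis passes through unchanged, and make_axes (defined further below) merely collects the resulting identifiers. The include path is an assumption.

#include <cassert>
#include <vector>

#include "graphblas/ascend/utils.hpp" // assumed include path

int main() {
	const int i = alp::getAxisId( "i" );  // first unseen label gets a fresh id
	const int j = alp::getAxisId( "j" );  // a different label gets a different id
	assert( alp::getAxisId( "i" ) == i ); // repeated labels resolve to the same id
	assert( i != j );
	assert( alp::getAxisId( 5 ) == 5 );   // integral axes are passed through unchanged

	const std::vector< int > axes = alp::make_axes( "i", "j" );
	assert( axes.size() == 2 && axes[ 0 ] == i && axes[ 1 ] == j );
	return 0;
}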
static inline int getAxisId( const char* axis ) { + return getAxisId( std::string( axis ) ); + } + + template< typename = void > + static inline std::vector< int > make_axes( ) { + return std::vector< int >(0); + } + + template< typename AxisType > + static inline std::vector< int > make_axes( AxisType axis ) { + return std::vector< int >{ getAxisId( axis ) }; + } + + template< typename AxisType, typename... AxisPackType > + static std::vector< int > make_axes( const AxisType arg1, AxisPackType const... args ) { + std::vector< int > axes{ getAxisId( arg1 ) }; + + for( auto arg : { args... } ) { + axes.push_back( getAxisId( arg ) ); + } + + return axes; + } + + +} // namespace alp + +#endif // _H_ALP_ASCEND_UTILS diff --git a/include/graphblas/ascend/vector.hpp b/include/graphblas/ascend/vector.hpp new file mode 100644 index 000000000..2072ba0bf --- /dev/null +++ b/include/graphblas/ascend/vector.hpp @@ -0,0 +1,480 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides the Ascend vector. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_VECTOR +#define _H_GRB_ASCEND_VECTOR + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "coordinates.hpp" +#include "spmd.hpp" + +#define NO_CAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a value of the same type as the first " \ + "domain of the given accumulator.\n" \ + "* Possible fix 3 | Provide a compatible accumulator where the first " \ + "domain is of the type of the given value in the template paramters " \ + "of this call to " y ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + +#define NO_MASKCAST_ASSERT( x, y, z ) \ + static_assert( x, \ + "\n\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" \ + "* ERROR | " y " " z ".\n" \ + "********************************************************************" \ + 
"********************************************************************" \ + "******************************\n" \ + "* Possible fix 1 | Remove no_casting from the template parameters " \ + "in this call to " y ".\n" \ + "* Possible fix 2 | Provide a vector of Booleans in this call to " y ".\n" \ + "********************************************************************" \ + "********************************************************************" \ + "******************************\n" ); + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + +} + +namespace grb { + + // forward declaration of backend-local matrix specialization for vector's + // friends + template< typename D, typename RIT, typename CIT, typename NIT > + class Matrix< D, ascend, RIT, CIT, NIT >; + + // forward-declare internal getters + namespace internal { + + template< typename D, typename C > + inline C & getCoordinates( Vector< D, ascend, C > &x ) noexcept; + + template< typename D, typename C > + inline const C & getCoordinates( + const Vector< D, ascend, C > &x + ) noexcept; + + template< typename D, typename C > + inline D * getRaw( Vector< D, ascend, C > &x ) noexcept; + + template< typename D, typename C > + inline const D * getRaw( const Vector< D, ascend, C > &x ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + inline internal::Compressed_Storage< D, RIT, NIT > & getCRS( + Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + inline const internal::Compressed_Storage< D, RIT, NIT > & getCRS( + const Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + inline internal::Compressed_Storage< D, CIT, NIT > & getCCS( + Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename D, typename RIT, typename CIT, typename NIT > + inline const internal::Compressed_Storage< D, CIT, NIT > & getCCS( + const Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept; + + template< typename D, typename C > + inline Vector< D, reference, C >& getRefVector( + Vector< D, ascend, C > &x ) noexcept; + + template< typename D, typename C > + inline const Vector< D, reference, C >& getRefVector( + const Vector< D, ascend, C > &x ) noexcept; + + } // namespace internal + + template< typename D, typename MyCoordinates > + class Vector< D, ascend, MyCoordinates > { + + static_assert( !grb::is_object< D >::value, "Cannot create an ALP/GraphBLAS" + "vector of ALP/GraphBLAS objects!" 
); + + /* ********************* + `Getter' friends + ********************* */ + + friend MyCoordinates & internal::getCoordinates< D, MyCoordinates >( + Vector< D, ascend, MyCoordinates > & x ) noexcept; + + friend const MyCoordinates & internal::getCoordinates< D, MyCoordinates >( + const Vector< D, ascend, MyCoordinates > & x ) noexcept; + + friend D * internal::getRaw< D, MyCoordinates >( + Vector< D, ascend, MyCoordinates > & x ) noexcept; + + friend const D * internal::getRaw< D, MyCoordinates >( + const Vector< D, ascend, MyCoordinates > & x ) noexcept; + + friend Vector< D, reference, MyCoordinates > & internal::getRefVector<>( + Vector< D, ascend, MyCoordinates > &x ) noexcept; + + friend const Vector< D, reference, MyCoordinates > & internal::getRefVector<>( + const Vector< D, ascend, MyCoordinates > &x ) noexcept; + + /* ********************* + IO friends + ********************* */ + + friend class PinnedVector< D, ascend >; + + + private: + + Vector< D, reference, MyCoordinates > ref; + + + public: + + /** @see Vector::value_type. */ + typedef D value_type; + + /** + * This implementation makes the simplest implementation choice and declares + * a lambda reference to be of the same type as a regular C++ reference. The + * restrictions as specified in Vector::lambda_reference, however, still + * apply. + * + * @see Vector::lambda_reference for the user-level specification. + */ + typedef D & lambda_reference; + + typedef typename Vector< D, reference, MyCoordinates >::const_iterator + const_iterator; + + + Vector( const size_t n, const size_t nz ) : ref( n, nz ) {} + + Vector( const size_t n ) : Vector( n, n ) { + + // pipeline execution is not required here as this is a grb::Vector + // declaration +#ifdef _DEBUG + std::cerr << "In Vector< ascend >::Vector( size_t ) constructor\n"; +#endif + } + + Vector() : Vector( 0 ) {} + + Vector( const Vector< D, ascend, MyCoordinates > &x ) : + ref( size( x.ref ), capacity( x.ref ) ) + { + // full delegation to the copy constructor of the reference backend is + // impossible since the pipeline must be executed before the copy + // constructor + // instead a parameterized constructor of the reference backend is invoked + // to perform the necessary initialization as the initialize method is not + // defined for the ascend backend + if( internal::getCoordinates( x ).size() > 0 ) { + internal::le.execution( &x ); + } + + + // once the execution of any required pipeline is completed + // the set primitive initializes the vector for this copy constructor + if( size( x ) > 0 ) { + const RC rc = set( *this, x ); + if( rc != SUCCESS ) { + throw std::runtime_error( "grb::set inside copy-constructor: " + + toString( rc ) ); + } + } + } + + Vector( Vector< D, ascend, MyCoordinates > &&x ) noexcept { + + if( internal::getCoordinates( x ).size() > 0 ) { + internal::le.execution( &x ); + } + + ref = std::move( x.ref ); + } + + Vector< D, ascend, MyCoordinates > & operator=( + const Vector< D, ascend, MyCoordinates > &x + ) { + const RC rc = set( *this, x ); + if( rc != grb::SUCCESS ) { + throw std::runtime_error( grb::toString( rc ) ); + } + return *this; + } + + Vector< D, ascend, MyCoordinates > & operator=( + Vector< D, ascend, MyCoordinates > &&x + ) noexcept { + if( internal::getCoordinates( x ).size() > 0 ) { + internal::le.execution( &x ); + } + ref = std::move( x.ref ); + return *this; + } + + ~Vector() { + /* TODO this interferes with opgen + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + }*/ + } + + 
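To make the nonblocking contract concrete, here is a minimal usage sketch (not part of this change set): read accessors such as the member nnz() and the iterators defined below first flush any pending pipeline that touches the container, so user code always observes up-to-date data. The umbrella header name and the use of grb::set are assumptions for illustration, and the translation unit is assumed to be compiled with the ascend backend selected.

#include <iostream>

#include <graphblas.hpp> // assumed umbrella header

int vector_read_sketch() {
	grb::Vector< double > x( 8 );
	if( grb::set( x, 1.5 ) != grb::SUCCESS ) { // may only be queued in a lazy pipeline
		return 1;
	}
	size_t count = 0;
	if( x.nnz( count ) != grb::SUCCESS ) {     // read accessor: executes the pending pipeline first
		return 1;
	}
	std::cout << count << " nonzeroes\n";
	for( const auto &nz : x ) {                // iteration likewise triggers execution before reading
		std::cout << nz.first << ": " << nz.second << "\n";
	}
	return 0;
}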
const_iterator begin( + const size_t s = 0, const size_t P = 1 + ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.begin(s, P); + } + + const_iterator end( + const size_t s = 0, const size_t P = 1 + ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.end(s, P); + } + + const_iterator cbegin( + const size_t s = 0, const size_t P = 1 + ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.cbegin(s, P); + } + + const_iterator cend( + const size_t s = 0, const size_t P = 1 + ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.cend(s, P); + } + + template< Descriptor descr = descriptors::no_operation, + typename mask_type, + class Accum, + typename ind_iterator = const size_t * __restrict__, + typename nnz_iterator = const D * __restrict__, + class Dup = operators::right_assign< + D, typename nnz_iterator::value_type, D + > + > + RC build( + const Vector< mask_type, ascend, MyCoordinates > &mask, + const Accum &accum, + const ind_iterator ind_start, + const ind_iterator ind_end, + const nnz_iterator nnz_start, + const nnz_iterator nnz_end, + const Dup &dup = Dup() + ) { + return ref.build( mask.ref, accum, ind_start, ind_end, nnz_start, nnz_end, + dup ); + } + + template< + Descriptor descr = descriptors::no_operation, + class Accum = operators::right_assign< D, D, D >, + typename T, typename mask_type = bool + > + RC assign( + const T &val, + const Vector< mask_type, ascend, MyCoordinates > &mask, + const Accum &accum = Accum() + ) { + return ref.assign( val, mask.ref, accum ); + } + + template< typename T > + RC nnz( T &nnz ) const { + if( internal::getCoordinates( *this ).size() > 0 ) { + internal::le.execution( this ); + } + + return ref.nnz( nnz ); + } + + D * raw() const { + return ref.raw(); + } + + lambda_reference operator[]( const size_t i ) { + return ref[ i ]; + } + + lambda_reference operator[]( const size_t i ) const { + return ref[ i ]; + } + + }; + + // specialisation for GraphBLAS type_traits + template< typename D, typename Coord > + struct is_container< Vector< D, ascend, Coord > > { + /** A ascend vector is a GraphBLAS object. 
*/ + static const constexpr bool value = true; + }; + + // internal getters implementation + namespace internal { + + template< typename D, typename C > + inline C & getCoordinates( Vector< D, ascend, C > &x ) noexcept { + return internal::getCoordinates( x.ref ); + } + + template< typename D, typename C > + inline const C & getCoordinates( + const Vector< D, ascend, C > &x + ) noexcept { + return internal::getCoordinates( x.ref ); + } + + template< typename D, typename C > + inline D * getRaw( Vector< D, ascend, C > &x ) noexcept { + return getRaw( x.ref ); + } + + template< typename D, typename C > + inline const D * getRaw( const Vector< D, ascend, C > &x ) noexcept { + return getRaw( x.ref ); + } + + template< typename D, typename RIT, typename CIT, typename NIT > + inline internal::Compressed_Storage< D, RIT, NIT > & getCRS( + Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return getCRS( A.ref ); + } + + template< typename D, typename RIT, typename CIT, typename NIT > + inline const internal::Compressed_Storage< D, RIT, NIT > & getCRS( + const Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return getCRS( A.ref ); + } + + template< typename D, typename RIT, typename CIT, typename NIT > + inline internal::Compressed_Storage< D, CIT, NIT > & getCCS( + Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return getCCS( A.ref ); + } + + template< typename D, typename RIT, typename CIT, typename NIT > + inline const internal::Compressed_Storage< D, CIT, NIT > & getCCS( + const Matrix< D, ascend, RIT, CIT, NIT > &A + ) noexcept { + return getCCS( A.ref ); + } + + template< typename D, typename C > + inline Vector< D, reference, C >& getRefVector( + Vector< D, ascend, C > &x + ) noexcept { + return x.ref; + } + + template< typename D, typename C > + inline const Vector< D, reference, C >& getRefVector( + const Vector< D, ascend, C > &x + ) noexcept { + return x.ref; + } + + } // namespace internal + +} // namespace grb + +#undef NO_CAST_ASSERT +#undef NO_MASKCAST_ASSERT + +#endif // end ``_H_GRB_ASCEND_VECTOR'' + diff --git a/include/graphblas/ascend/vector_wrapper.hpp b/include/graphblas/ascend/vector_wrapper.hpp new file mode 100644 index 000000000..ac10ac2fe --- /dev/null +++ b/include/graphblas/ascend/vector_wrapper.hpp @@ -0,0 +1,192 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides a wrapper to a scalar or a vector, for those primitives that could + * take either. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#ifndef _H_GRB_ASCEND_VECTOR_WRAPPER +#define _H_GRB_ASCEND_VECTOR_WRAPPER + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "coordinates.hpp" +#include "vector.hpp" +#include "blas1.hpp" + + +namespace grb { + + namespace internal { + + /** + * A wrapper class used to store a scalar value, which is passed by value to + * an internal function used by an ALP/GraphBLAS operation. 
The wrapper + * classes are used by operations that may have a formal parameter that is + * either a scalar or a vector, because the implementation is generic and + * handles all possible cases. + */ + template< bool scalar, typename InputType, typename CoordinatesType > + class Wrapper { + + private: + + /** + * \warning This is not a reference, since the semantics are that the + * \em current scalar value is used. + */ + InputType val; + + + public: + + /** Base constructor that copies the input scalar. */ + Wrapper(const InputType &value) : val( value ) {} + + /** Default copy constructor. */ + Wrapper( const Wrapper< scalar, InputType, CoordinatesType > & ) = default; + + /** + * @returns nullptr + * + * This function returns a raw array for vectors only). + */ + constexpr InputType * getRaw() const { + return nullptr; + } + + /** + * @returns nullptr + * + * This function returns coordinates only for vectors. + */ + constexpr CoordinatesType * getCoordinates() const { + return nullptr; + } + + /** + * @returns nullptr + * + * This function returns a vector pointer only when wrapping a vector. + */ + constexpr Vector< InputType, ascend, CoordinatesType > * getPointer() + const + { + return nullptr; + } + + /** + * @returns The scalar value it wraps. + */ + const InputType & getValue() const { + return val; + } + + /** + * @returns Whether the underlying container is dense. + */ + bool isDense() const { + return true; + } + + }; + + /** + * A wrapper class used to store a vector, which is passed by reference to an + * internal function used by an ALP/GraphBLAS operation. The wrapper classes + * are used by by operations that may have a formal parameter that is either a + * scalar or a vector, because the implementation is generic and handles all + * possible cases. + */ + template< typename InputType, typename CoordinatesType > + class Wrapper< false, InputType, CoordinatesType > { + + private: + + /** A reference to the vector this class wraps. */ + const Vector< InputType, ascend, CoordinatesType > &vec; + + + public: + + /** Base constructor wrapping arund a given \a vector. */ + Wrapper( const Vector< InputType, ascend, CoordinatesType > &vector ) : + vec( vector ) + {} + + /** Copy constructor. */ + Wrapper( const Wrapper< false, InputType, CoordinatesType > &w ) : + vec( w.vec ) + {} + + /** @returns The underlying raw value array. */ + const InputType * getRaw() const { + return internal::getRaw( vec ); + } + + /** @returns The underlying coordinates instance. */ + const CoordinatesType * getCoordinates() const { + return &internal::getCoordinates( vec ); + } + + /** @returns The underlying vector (a pointer to it). */ + const Vector< InputType, ascend, CoordinatesType > * getPointer() + const + { + return &vec; + } + + /** + * @returns a possibly unitialised value that is not intended to be + * consumed. + * + * \warning This function should only be called on wrappers of scalars. + */ + const InputType & getValue() const { + // this is a trick to avoid compilation errors, this value will never be + // used in practice + return *( getRaw( ) ); + } + + /** + * @returns Whether the underlying vector is dense. 
+ */ + bool isDense() const { + return internal::getCoordinates( vec ).isDense(); + } + }; + + } // end namespace ``internal'' + +} // end namespace ``grb'' + +#endif + diff --git a/include/graphblas/backends.hpp b/include/graphblas/backends.hpp index 653348112..5c5770377 100644 --- a/include/graphblas/backends.hpp +++ b/include/graphblas/backends.hpp @@ -75,6 +75,14 @@ namespace grb { */ nonblocking, + /** + * The Ascend backend. This is not a true ALP implementation -- programs + * compiled using this backend generate AscendC code. This backend may require + * the use of ALP/Ascend-specific primitives as it currently operates without + * compiler support. + */ + ascend, + /** * \internal * A shared-memory parallel distribution based on a row-wise 1D block-cyclic diff --git a/include/graphblas/benchmark.hpp b/include/graphblas/benchmark.hpp index 81bd67773..81e46c251 100644 --- a/include/graphblas/benchmark.hpp +++ b/include/graphblas/benchmark.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/benchmark.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/benchmark.hpp" +#endif #ifdef _GRB_WITH_BANSHEE #include "graphblas/banshee/benchmark.hpp" #endif diff --git a/include/graphblas/blas1.hpp b/include/graphblas/blas1.hpp index e28c9e8ad..d33a51cf3 100644 --- a/include/graphblas/blas1.hpp +++ b/include/graphblas/blas1.hpp @@ -34,6 +34,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/blas1.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/blas1.hpp" +#endif #ifdef _GRB_WITH_BANSHEE #include #endif diff --git a/include/graphblas/blas2.hpp b/include/graphblas/blas2.hpp index 2a0b1338e..2f0afbc43 100644 --- a/include/graphblas/blas2.hpp +++ b/include/graphblas/blas2.hpp @@ -39,6 +39,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/blas2.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/blas2.hpp" +#endif #ifdef _GRB_WITH_BANSHEE #include #endif diff --git a/include/graphblas/blas3.hpp b/include/graphblas/blas3.hpp index 6ed90264b..77dfcbb8c 100644 --- a/include/graphblas/blas3.hpp +++ b/include/graphblas/blas3.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/blas3.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/blas3.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/collectives.hpp b/include/graphblas/collectives.hpp index 8ca63fd3e..e3f839c27 100644 --- a/include/graphblas/collectives.hpp +++ b/include/graphblas/collectives.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/collectives.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/collectives.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/config.hpp b/include/graphblas/config.hpp index d7c2a650f..8a3652ca7 100644 --- a/include/graphblas/config.hpp +++ b/include/graphblas/config.hpp @@ -35,6 +35,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/config.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/config.hpp" +#endif #ifdef _GRB_WITH_OMP #include "graphblas/omp/config.hpp" #endif diff --git a/include/graphblas/coordinates.hpp b/include/graphblas/coordinates.hpp index 43f5c9845..59e80adad 100644 --- a/include/graphblas/coordinates.hpp +++ b/include/graphblas/coordinates.hpp @@ -32,8 +32,8 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/coordinates.hpp" #endif -#ifdef _GRB_WITH_LPF -// #include +#ifdef _GRB_WITH_ASCEND 
+ #include "graphblas/ascend/coordinates.hpp" #endif #ifdef _GRB_WITH_BANSHEE #include diff --git a/include/graphblas/exec.hpp b/include/graphblas/exec.hpp index f7ecb8cc2..64477bbef 100644 --- a/include/graphblas/exec.hpp +++ b/include/graphblas/exec.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/exec.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/exec.hpp" +#endif #ifdef _GRB_WITH_LPF #include "graphblas/bsp1d/exec.hpp" #endif diff --git a/include/graphblas/identities.hpp b/include/graphblas/identities.hpp index fdbb7c7f7..6478eb7ab 100644 --- a/include/graphblas/identities.hpp +++ b/include/graphblas/identities.hpp @@ -131,14 +131,19 @@ namespace grb { * `minus infinity'. */ static constexpr D value() { - return std::numeric_limits< D >::min() == 0 ? 0 : ( std::numeric_limits< D >::has_infinity ? -std::numeric_limits< D >::infinity() : std::numeric_limits< D >::min() ); + return std::numeric_limits< D >::min() == 0 + ? 0 + : ( std::numeric_limits< D >::has_infinity + ? -std::numeric_limits< D >::infinity() + : std::numeric_limits< D >::min() ); } }; template< typename K, typename V > class negative_infinity< std::pair< K, V > > { public: static constexpr std::pair< K, V > value() { - return std::make_pair( negative_infinity< K >::value(), negative_infinity< V >::value() ); + return std::make_pair( negative_infinity< K >::value(), + negative_infinity< V >::value() ); } }; @@ -149,9 +154,11 @@ namespace grb { */ template< typename D > class logical_false { - static_assert( std::is_convertible< bool, D >::value, "Cannot form identity under the requested domain" ); + static_assert( std::is_convertible< bool, D >::value, + "Cannot form identity under the requested domain" ); + + public: - public: /** * @tparam D The domain of the value to return. 
* @return The identity under the standard logical OR operator, i.e., @@ -176,7 +183,8 @@ namespace grb { */ template< typename D > class logical_true { - static_assert( std::is_convertible< bool, D >::value, "Cannot form identity under the requested domain" ); + static_assert( std::is_convertible< bool, D >::value, + "Cannot form identity under the requested domain" ); public: /** @@ -192,11 +200,13 @@ namespace grb { class logical_true< std::pair< K, V > > { public: static constexpr std::pair< K, V > value() { - return std::make_pair( logical_true< K >::value(), logical_true< V >::value() ); + return std::make_pair( logical_true< K >::value(), + logical_true< V >::value() ); } }; } // namespace identities + } // namespace grb #endif diff --git a/include/graphblas/init.hpp b/include/graphblas/init.hpp index dd34749ba..794f6c475 100644 --- a/include/graphblas/init.hpp +++ b/include/graphblas/init.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/init.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/init.hpp" +#endif #ifdef _GRB_WITH_LPF #include "graphblas/bsp1d/init.hpp" #endif diff --git a/include/graphblas/io.hpp b/include/graphblas/io.hpp index 8fbb70a13..a326a294c 100644 --- a/include/graphblas/io.hpp +++ b/include/graphblas/io.hpp @@ -35,6 +35,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/io.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/io.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/matrix.hpp b/include/graphblas/matrix.hpp index bccb0f928..8e46ba8ef 100644 --- a/include/graphblas/matrix.hpp +++ b/include/graphblas/matrix.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/matrix.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/matrix.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/nonblocking/coordinates.hpp b/include/graphblas/nonblocking/coordinates.hpp index 1aca8e322..c24ca64a3 100644 --- a/include/graphblas/nonblocking/coordinates.hpp +++ b/include/graphblas/nonblocking/coordinates.hpp @@ -45,6 +45,7 @@ #include #include +#include #include #include diff --git a/include/graphblas/nonblocking/lazy_evaluation.hpp b/include/graphblas/nonblocking/lazy_evaluation.hpp index 426f530fb..791ae122c 100644 --- a/include/graphblas/nonblocking/lazy_evaluation.hpp +++ b/include/graphblas/nonblocking/lazy_evaluation.hpp @@ -43,11 +43,18 @@ namespace grb { */ class LazyEvaluation { - private: + friend class alp::internal::OpGen; + + + //private: + public: // DBG /** Multiple pipelines may be maintained at any time. */ std::vector< Pipeline > pipelines; + + private: // DBG + /** Stores the pipelines that share data with the new stage. */ std::vector< std::vector< Pipeline >::iterator > shared_data_pipelines; @@ -116,10 +123,15 @@ namespace grb { const size_t data_type_size, const bool dense_descr, const bool dense_mask, + const size_t output_container_id, + // TODO FIXME is there really a need for pointers? void * const output_container_ptr, void * const output_aux_container_ptr, Coordinates< nonblocking > * const coor_output_ptr, Coordinates< nonblocking > * const coor_output_aux_ptr, + const size_t input_a_id, const size_t input_b_id, + const size_t input_c_id, const size_t input_d_id, + // TODO FIXME is there really a need for pointers? 
const void * const input_a_ptr, const void * const input_b_ptr, const void * const input_c_ptr, @@ -128,6 +140,8 @@ namespace grb { const Coordinates< nonblocking > * const coor_b_ptr, const Coordinates< nonblocking > * const coor_c_ptr, const Coordinates< nonblocking > * const coor_d_ptr, + const size_t input_matrix_id, + // TODO FIXME is there really a need for pointers? const void * const input_matrix ); diff --git a/include/graphblas/nonblocking/pipeline.hpp b/include/graphblas/nonblocking/pipeline.hpp index 62500d115..a1689a88c 100644 --- a/include/graphblas/nonblocking/pipeline.hpp +++ b/include/graphblas/nonblocking/pipeline.hpp @@ -72,6 +72,15 @@ #include "coordinates.hpp" +// TODO ugly hack, fwd declare ALP::internal::OpGen +namespace alp { + template< size_t process_order, size_t problem_order > + class Grid; + namespace internal { + class OpGen; + } +} + namespace grb { namespace internal { @@ -105,6 +114,11 @@ namespace grb { */ class Pipeline { + friend class alp::internal::OpGen; + template< size_t process_order, size_t problem_order > + friend + class alp::Grid; + public: // The pipeline is passed by reference such that an out-of-place operation @@ -119,9 +133,22 @@ namespace grb { size_t containers_size; size_t size_of_data_type; + + // per-stage data std::vector< stage_type > stages; + + + public: //DBG + std::vector< Opcode > opcodes; + + private: //DBG + + std::vector< std::vector< size_t > > stage_inputs; + std::vector< size_t > stage_output; + + // per-pipeline data std::set< Coordinates< nonblocking > * > accessed_coordinates; std::set< const void * > input_vectors; std::set< const void * > output_vectors; @@ -276,10 +303,17 @@ namespace grb { const size_t data_type_size, const bool dense_descr, const bool dense_mask, + const size_t output_vector_id, + // TODO FIXME is there really a need for pointers? void * const output_vector_ptr, void * const output_aux_vector_ptr, Coordinates< nonblocking > * const coor_output_ptr, Coordinates< nonblocking > * const coor_output_aux_ptr, + const size_t input_a_id, + const size_t input_b_id, + const size_t input_c_id, + const size_t input_d_id, + // TODO FIXME is there really a need for pointers? const void * const input_a_ptr, const void * const input_b_ptr, const void * const input_c_ptr, @@ -288,6 +322,8 @@ namespace grb { const Coordinates< nonblocking > * const coor_b_ptr, const Coordinates< nonblocking > * const coor_c_ptr, const Coordinates< nonblocking > * const coor_d_ptr, + const size_t input_matrix_id, + // TODO FIXME is there really a need for pointers? 
const void * const input_matrix ); diff --git a/include/graphblas/pinnedvector.hpp b/include/graphblas/pinnedvector.hpp index 380c53ae7..4bf106fbd 100644 --- a/include/graphblas/pinnedvector.hpp +++ b/include/graphblas/pinnedvector.hpp @@ -40,6 +40,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/pinnedvector.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/pinnedvector.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/properties.hpp b/include/graphblas/properties.hpp index 864b849cd..effcded68 100644 --- a/include/graphblas/properties.hpp +++ b/include/graphblas/properties.hpp @@ -36,6 +36,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/properties.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/properties.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/include/graphblas/spmd.hpp b/include/graphblas/spmd.hpp index 88cef92bc..f65a8c1ea 100644 --- a/include/graphblas/spmd.hpp +++ b/include/graphblas/spmd.hpp @@ -35,6 +35,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/spmd.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/spmd.hpp" +#endif #ifdef _GRB_WITH_LPF #include "graphblas/bsp1d/spmd.hpp" #endif diff --git a/include/graphblas/tags.hpp b/include/graphblas/tags.hpp index 07d39eb2e..e1791b2be 100644 --- a/include/graphblas/tags.hpp +++ b/include/graphblas/tags.hpp @@ -42,6 +42,12 @@ namespace grb { * template to grb::Vector and grb::Matrix creates a combinatorial * explosion in the number of combinations that must be caught. * Are there better alternatives? + * + * Update 2023: yes there are, see Spampinato et al., ARRAY '23. This + * file will be removed in future releases when it is replaced by the + * concept of \em views and particular that of xMFs that prevent the + * feared combinatorial explosion, both introduced in the aforementioned + * paper. 
*/ namespace tags { diff --git a/include/graphblas/utils/alloc.hpp b/include/graphblas/utils/alloc.hpp index 5943e5216..fd98b6b72 100644 --- a/include/graphblas/utils/alloc.hpp +++ b/include/graphblas/utils/alloc.hpp @@ -59,6 +59,9 @@ namespace grb { #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/alloc.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/alloc.hpp" +#endif #ifdef _GRB_WITH_LPF #include "graphblas/bsp1d/alloc.hpp" #endif diff --git a/include/graphblas/vector.hpp b/include/graphblas/vector.hpp index 5ac75b1b1..ee7613ea1 100644 --- a/include/graphblas/vector.hpp +++ b/include/graphblas/vector.hpp @@ -37,6 +37,9 @@ #ifdef _GRB_WITH_NONBLOCKING #include "graphblas/nonblocking/vector.hpp" #endif +#ifdef _GRB_WITH_ASCEND + #include "graphblas/ascend/vector.hpp" +#endif #ifdef _GRB_WITH_LPF #include #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 50e731a30..d032974d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,15 +89,20 @@ list( JOIN compiler_list " " BACKEND_COMPILERS_SPACED ) list( JOIN compile_options_list " " BACKEND_CFLAGS_SPACED ) list( JOIN link_flags_list " " BACKEND_LFLAGS_SPACED ) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/grbcxx.in ${CMAKE_CURRENT_BINARY_DIR}/grbcxx @ONLY ) +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/grbcxx.in ${CMAKE_CURRENT_BINARY_DIR}/alpcxx @ONLY ) list( JOIN runenv_list " " BACKEND_RUNENV_SPACED ) list( JOIN runner_list " " BACKEND_RUNNER_SPACED ) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/grbrun.in ${CMAKE_CURRENT_BINARY_DIR}/grbrun @ONLY ) +configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/grbrun.in ${CMAKE_CURRENT_BINARY_DIR}/alprun @ONLY ) # install them to the install folder with execute permission install( FILES ${CMAKE_CURRENT_BINARY_DIR}/setenv ${CMAKE_CURRENT_BINARY_DIR}/grbcxx ${CMAKE_CURRENT_BINARY_DIR}/grbrun + ${CMAKE_CURRENT_BINARY_DIR}/alpcxx + ${CMAKE_CURRENT_BINARY_DIR}/alprun + ${CMAKE_CURRENT_BINARY_DIR}/ascendcc DESTINATION "${BIN_INSTALL_DIR}" PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE @@ -105,6 +110,16 @@ install( FILES ${CMAKE_CURRENT_BINARY_DIR}/setenv WORLD_READ WORLD_EXECUTE ) +if( WITH_ASCEND_BACKEND ) + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/ascendcc.in ${CMAKE_CURRENT_BINARY_DIR}/ascendcc @ONLY ) + install( FILES ${CMAKE_CURRENT_BINARY_DIR}/ascendcc + DESTINATION "${BIN_INSTALL_DIR}" + PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + ) +endif() + ### GENERATE CMAKE INFRASTRUCTURE INSIDE INSTALLATION DIRECTORY include(CMakePackageConfigHelpers) diff --git a/src/ascendcc.in b/src/ascendcc.in new file mode 100755 index 000000000..27d420242 --- /dev/null +++ b/src/ascendcc.in @@ -0,0 +1,125 @@ +#!/bin/bash + +# +# Copyright 2021 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +if [ -z "${ASCEND_TOOLKIT_INSTALL_PATH}" ]; then + echo "Please set ASCEND_TOOLKIT_INSTALL_PATH" + echo ' - for example, issue: export ASCEND_TOOLKIT_INSTALL_PATH="/home/yzelman/Packages/CANN/x86_64/ascend-toolkit/latest/"' + exit 255 +fi + +#example path: +#ASCEND_TOOLKIT_INSTALL_PATH="/home/yzelman/Packages/CANN/ascend-toolkit/latest/" + +if [ ! -d "${ASCEND_TOOLKIT_INSTALL_PATH}" ]; then + echo "Error: ASCEND_TOOLKIT_INSTALL_PATH (${ASCEND_TOOLKIT_INSTALL_PATH}) does not exist" + exit 255 +fi + +ASCEND_TOOLKIT_INSTALL_PATH=${ASCEND_TOOLKIT_INSTALL_PATH}/ +echo "Info: using ASCEND_TOOLKIT_INSTALL_PATH=${ASCEND_TOOLKIT_INSTALL_PATH}" + +ASCEND_COMPILER="${ASCEND_TOOLKIT_INSTALL_PATH}/compiler/ccec_compiler/bin/ccec" + +BACKENDS=( "910B" "910" ) +ASCEND_CXXFLAGS="-xcce -DTILING_KEY_VAR=0 -O2 -std=c++17 -fPIC -pthread" +ASCEND_CCECFLAGS_LIST=( "--cce-aicore-arch=dav-c220-cube -mllvm -cce-aicore-function-stack-size=16000 -mllvm -cce-aicore-record-overflow=false -mllvm -cce-aicore-addr-transform --cce-auto-sync" \ + "--cce-aicore-arch=dav-c100 --cce-auto-sync" ) +ASCEND_CCECFLAGS_LLIST=( "--cce-fatobj-link --cce-aicore-arch=dav-c220-cube" \ + "--cce-fatobj-link --cce-aicore-arch=dav-c100" ) +ASCEND_LFLAGS_LIST=( "-L${ASCEND_TOOLKIT_INSTALL_PATH}runtime/lib64 -L${ASCEND_TOOLKIT_INSTALL_PATH}tools/simulator/Ascend910B1/lib -L${ASCEND_TOOLKIT_INSTALL_PATH}tools/tikicpulib/lib/Ascend910B1 -lstdc++ -lruntime -lascendcl" \ + "-L${ASCEND_TOOLKIT_INSTALL_PATH}runtime/lib64 -L${ASCEND_TOOLKIT_INSTALL_PATH}tools/simulator/Ascend910A/lib -L${ASCEND_TOOLKIT_INSTALL_PATH}tools/tikicpulib/lib/Ascend910A -lstdc++ -lruntime -lascendcl" ) +ASCEND_IFLAGS="-I${ASCEND_TOOLKIT_INSTALL_PATH}acllib/include -I${ASCEND_TOOLKIT_INSTALL_PATH}compiler/tikcpp/tikcfw -I${ASCEND_TOOLKIT_INSTALL_PATH}compiler/tikcpp/tikcfw/impl -I${ASCEND_TOOLKIT_INSTALL_PATH}compiler/tikcpp/tikcfw/interface -I${ASCEND_TOOLKIT_INSTALL_PATH}tools/tikicpulib/lib/include" + +declare -a ARGS +SHOW="eval" +BACKEND=${BACKENDS[0]} +LINK=true + +while [[ $# -gt 0 ]]; do + option="$1" + shift; + case ${option} in + -b|--backend) + BACKEND=$1 + shift + ;; + -c) + ARGS+=("-c") + LINK=false + ;; + --show) + SHOW="echo" + ;; + --version) + echo "This is ALP/Ascend" + echo " " + echo "This software comes with NO warranty; not even for" + echo "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + echo "See the license file for details." 
+ echo " " + ARGS+=("${option}") + ;; + --) + break + ;; + *) + ARGS+=( "${option}" ) + ;; + esac +done + +BACKENDID=-1 +for i in "${!BACKENDS[@]}"; do + if [[ "${BACKENDS[$i]}" = "${BACKEND}" ]] + then + BACKENDID=$i + break + fi +done + +echo "Info: compiling for ${BACKEND}" + +if [[ ${BACKENDID} -eq -1 ]] +then + echo "Could not find requested backend \`${BACKEND}'" + exit 255 +fi + +ASCEND_CCECFLAGS=${ASCEND_CCECFLAGS_LIST[$i]} + +if ${LINK} +then + CMD="${ASCEND_COMPILER} ${ASCEND_CCECFLAGS_LLIST[$i]} "${ARGS[@]}" "$@" ${ASCEND_LFLAGS_LIST[$i]}" +else + CMD="${ASCEND_COMPILER} ${ASCEND_CXXFLAGS} ${ASCEND_CCECFLAGS} ${ASCEND_IFLAGS} "${ARGS[@]}" "$@" ${LFLAGS}" +fi + +${SHOW} "${CMD}" + +#Ascend 910B, -c: +#/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec -xcce -DTILING_KEY_VAR=0 -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/acllib/include" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw/impl" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw/interface" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c220-cube -mllvm -cce-aicore-function-stack-size=16000 -mllvm -cce-aicore-record-overflow=false -mllvm -cce-aicore-addr-transform --cce-auto-sync -fPIC -pthread -o CMakeFiles/add_custom_npu.dir/__/__/add_custom.cpp.o -c /home/yzelman/Documents/Ascend/samples-master/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/kernel_invocation/Add/add_custom.cpp + +#Ascend 910B, link: +#/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec --cce-fatobj-link --cce-aicore-arch=dav-c220-cube CMakeFiles/add_custom_npu.dir/__/__/add_custom.cpp.o CMakeFiles/add_custom_npu.dir/__/__/main.cpp.o -o ../../../add_custom_npu -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/runtime/lib64" -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/simulator/Ascend910B1/lib" -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/tikicpulib/lib/Ascend910B1" -lstdc++ -lruntime -lascendcl + +#Ascend 910, -c: +#cd /home/yzelman/Documents/Ascend/samples-master/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/kernel_invocation/Add/build/cmake/npu && /home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec -xcce -DTILING_KEY_VAR=0 -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/acllib/include" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw/impl" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/tikcpp/tikcfw/interface" -I"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/tikicpulib/lib/include" -O2 -std=c++17 --cce-aicore-arch=dav-c100 --cce-auto-sync -fPIC -pthread -o CMakeFiles/add_custom_npu.dir/__/__/add_custom.cpp.o -c /home/yzelman/Documents/Ascend/samples-master/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/kernel_invocation/Add/add_custom.cpp + +#Ascend 910, link: +#/home/yzelman/Packages/CANN/ascend-toolkit/latest/compiler/ccec_compiler/bin/ccec --cce-fatobj-link --cce-aicore-arch=dav-c100 CMakeFiles/add_custom_npu.dir/__/__/add_custom.cpp.o CMakeFiles/add_custom_npu.dir/__/__/main.cpp.o -o ../../../add_custom_npu -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/runtime/lib64" -L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/simulator/Ascend910A/lib" 
-L"/home/yzelman/Packages/CANN/ascend-toolkit/latest/tools/tikicpulib/lib/Ascend910A" -lstdc++ -lruntime -lascendcl + diff --git a/src/graphblas/CMakeLists.txt b/src/graphblas/CMakeLists.txt index 98924080e..d1692fd3d 100644 --- a/src/graphblas/CMakeLists.txt +++ b/src/graphblas/CMakeLists.txt @@ -93,6 +93,10 @@ if( WITH_NONBLOCKING_BACKEND ) add_subdirectory( nonblocking ) endif() +if( WITH_ASCEND_BACKEND ) + add_subdirectory( ascend ) +endif() + if( WITH_BSP1D_BACKEND OR WITH_HYBRID_BACKEND ) add_subdirectory( bsp1d ) endif() @@ -151,6 +155,9 @@ if( WITH_REFERENCE_BACKEND OR WITH_OMP_BACKEND OR WITH_NONBLOCKING_BACKEND ) if( WITH_NONBLOCKING_BACKEND ) target_link_libraries( backend_shmem_static PRIVATE ${backend_nonblocking_headers} ) endif() + if( WITH_ASCEND_BACKEND ) + target_link_libraries( backend_shmem_static PRIVATE ${backend_ascend_headers} ) + endif() # this is the actual binary file, i.e. the one to be installed install( TARGETS backend_shmem_static EXPORT GraphBLASTargets @@ -163,6 +170,9 @@ if( WITH_REFERENCE_BACKEND OR WITH_OMP_BACKEND OR WITH_NONBLOCKING_BACKEND ) if( WITH_NONBLOCKING_BACKEND ) target_link_libraries( backend_shmem_shared PRIVATE ${backend_nonblocking_headers} ) endif() + if( WITH_ASCEND_BACKEND ) + target_link_libraries( backend_shmem_shared PRIVATE ${backend_ascend_headers} ) + endif() install( TARGETS backend_shmem_shared EXPORT GraphBLASTargets LIBRARY DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}" diff --git a/src/graphblas/ascend/CMakeLists.txt b/src/graphblas/ascend/CMakeLists.txt new file mode 100644 index 000000000..aa3462eed --- /dev/null +++ b/src/graphblas/ascend/CMakeLists.txt @@ -0,0 +1,46 @@ +# +# Copyright 2021 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Creation of the nonblocking backend, both as static and dynamic library. Any target +# importing a backend also imports the compiler definition(s) required to set it as +# default. If a target want to do it manually, the target 'backend_shmem_static' exists +# with no default backend selection in its compilation interface. 
+# + +assert_valid_variables( SHMEM_BACKEND_INSTALL_DIR + ASCEND_BACKEND_DEFAULT_NAME ASCEND_SELECTION_DEFS +) + +assert_defined_variables( backend_reference_srcs ) + + +set( backend_reference_srcs ${backend_reference_srcs} + ${CMAKE_CURRENT_SOURCE_DIR}/opgen.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/semantics.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/init.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/io.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/grid.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/symbolTable.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lazy_evaluation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/stage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp + PARENT_SCOPE +) + diff --git a/src/graphblas/ascend/grid.cpp b/src/graphblas/ascend/grid.cpp new file mode 100644 index 000000000..1cc463b6e --- /dev/null +++ b/src/graphblas/ascend/grid.cpp @@ -0,0 +1,81 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include + +#include + +namespace alp { + + namespace internal { + + iGrid *igrid; + } +} + +alp::internal::iGrid::iGrid( size_t proc, size_t prob ) { + + process_order = proc; + problem_order = prob; +} + +size_t alp::internal::iGrid::getProcessOrder() const noexcept { + + return process_order; +} + +size_t alp::internal::iGrid::getProblemOrder() const noexcept { + + return problem_order; +} + +std::string alp::internal::iGrid::processSize( const size_t k ) const noexcept { + + return "p" + std::to_string( k ); +} + +std::string alp::internal::iGrid::processMode( const size_t k ) const noexcept { + + return "a" + std::to_string( k ); +} + +std::string alp::internal::iGrid::problemSize( const size_t k ) const noexcept { + + return "n" + std::to_string( k ); +} + +std::string alp::internal::iGrid::problemMode( const size_t k ) const noexcept { + + return "i" + std::to_string( k ); +} + +std::string alp::internal::iGrid::problemMainMode( const size_t k ) const noexcept { + + return "z" + std::to_string( k ); +} + +std::string alp::internal::iGrid::problemTileMode( const size_t k ) const noexcept { + + return "t" + std::to_string( k ); +} + +std::string alp::internal::iGrid::tileSize( const size_t k ) const noexcept { + + return "tile_size" + std::to_string( k ); +} + diff --git a/src/graphblas/ascend/init.cpp b/src/graphblas/ascend/init.cpp new file mode 100644 index 000000000..9926d8238 --- /dev/null +++ b/src/graphblas/ascend/init.cpp @@ -0,0 +1,59 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides initialisation for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#include + +#include + +#include +#include + +#include + +#include + + +template<> +grb::RC grb::init< grb::ascend >( + const size_t s, const size_t P, void * const data +) { + // If the environment variable GRB_ASCEND_TILE_SIZE is set, a fixed + // tile size is used for all pipelines built during the ascend execution. + // Therefore, the choice is manual. Otherwise, the choice is automatically + // made at run-time by the analytic model and may differ for different + // pipelines. + std::cerr << "Info: grb::init (ascend) called.\n"; + return grb::init< grb::reference >( s, P, data ); +} + +template<> +grb::RC grb::finalize< grb::ascend >() { + std::cerr << "Info: grb::finalize (ascend) called.\n"; + std::cerr << "Info: codegen will go to std::cout (TODO)\n"; +// alp::internal::OpGen::generate( std::cout ); + return grb::finalize< grb::reference >(); +} + diff --git a/src/graphblas/ascend/io.cpp b/src/graphblas/ascend/io.cpp new file mode 100644 index 000000000..49b19e4d9 --- /dev/null +++ b/src/graphblas/ascend/io.cpp @@ -0,0 +1,50 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Implements the grb::wait for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#include + +#include + + +namespace grb { + + namespace internal { + + extern LazyEvaluation le; + + } + + /** + * \internal This is a ascend implementation, and all + * pending operations must be completed. 
+ */ + template<> + RC wait< ascend >() { + return internal::le.execution(); + } + +} + diff --git a/src/graphblas/ascend/lazy_evaluation.cpp b/src/graphblas/ascend/lazy_evaluation.cpp new file mode 100644 index 000000000..480f46158 --- /dev/null +++ b/src/graphblas/ascend/lazy_evaluation.cpp @@ -0,0 +1,121 @@ +#include +#include + + +namespace alp +{ + namespace internal + { + AscendLazyEvaluation ale; + } +} + +alp::internal::AscendLazyEvaluation::AscendLazyEvaluation() { + + num_pipelines = 0; + addPipeline(); //TODO add the first pipeline +} + +void alp::internal::AscendLazyEvaluation::addPipeline() { + + pipelines.emplace_back( AscendPipeline( num_pipelines ) ); + num_pipelines++; +} + +void alp::internal::AscendLazyEvaluation::insertFreeInputTensorStages( const std::vector< int > &forEachAxes ) { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->insertFreeInputTensorStages( forEachAxes ); + } +} + +const alp::Tensor &alp::internal::AscendLazyEvaluation::store( const alp::Tensor &output_tensor ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + return pipeline->store( output_tensor ); +} + +void alp::internal::AscendLazyEvaluation::clear() { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->clear(); + } +} + +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const double alpha, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( op_type, rule, tensor1, alpha, forEachAxes ); +} + +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( op_type, rule, tensor1, activeAxes, forEachAxes ); +} + +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const alp::Tensor &tensor2, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( op_type, rule, tensor1, tensor2, activeAxes, forEachAxes ); +} + +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const alp::Tensor &tensor2, const alp::Tensor &tensor3, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( op_type, rule, tensor1, tensor2, tensor3, activeAxes, forEachAxes ); +} +/* +void alp::internal::AscendLazyEvaluation::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, const alp::Tensor &tensor1, const alp::Tensor &tensor2, const alp::Tensor &tensor3, const alp::Tensor &tensor4, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + //TODO: perhaps data dependence analysis will determine the right pipeline + auto pipeline = pipelines.rbegin(); + pipeline->addStage( 
op_type, rule, tensor1, tensor2, tensor3, tensor4, activeAxes, forEachAxes ); +} +*/ +void alp::internal::AscendLazyEvaluation::generateDeclarations( std::stringstream &declarations ) { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateDeclarations( declarations ); + } +} +/* +void alp::internal::AscendLazyEvaluation::generateConstructor( std::stringstream &constructor ) { + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateConstructor( constructor ); + } +} +*/ +void alp::internal::AscendLazyEvaluation::generateHostBody( std::stringstream &os, std::stringstream &analyticModelArgs, + std::stringstream &analyticModelFormalParams, std::stringstream &analyticModelDecls, + std::stringstream &analyticModelConstrBody ) { + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateHostBody( os, analyticModelArgs, analyticModelFormalParams, analyticModelDecls, analyticModelConstrBody ); + } +} + +void alp::internal::AscendLazyEvaluation::generateInit( std::stringstream &init ) { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateInit( init ); + } +} + +void alp::internal::AscendLazyEvaluation::generateProcess( std::stringstream &process, std::stringstream &processCall ) { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->generateProcess( process, processCall ); + } +} + +void alp::internal::AscendLazyEvaluation::debug_print() const { + + for( auto it = pipelines.begin(); it != pipelines.end(); ++it ) { + it->debug_print(); + } +} diff --git a/src/graphblas/ascend/operators.cpp b/src/graphblas/ascend/operators.cpp new file mode 100644 index 000000000..a87d9af3b --- /dev/null +++ b/src/graphblas/ascend/operators.cpp @@ -0,0 +1,200 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace alp +{ + namespace internal + { + extern AscendLazyEvaluation ale; + } +} + +namespace alp +{ + Tensor getView( const Tensor &parent ) { + + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + std::vector< int > difference_axes = internal::vectorDifference( parent.getAxes(), forEachAxes ); + + Tensor ret_view( parent, difference_axes ); + + internal::Rule rule = internal::Rule::NONE; + + alp::internal::ale.addStage( alp::internal::Stagetype::GET_VIEW, rule, parent, difference_axes, forEachAxes ); + + return ret_view; + } + + // TODO extend to multiple containers + void store( const Tensor &output ) { + + const alp::Tensor &parent = alp::internal::ale.store( output ); + + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + std::vector< int > difference_axes = internal::vectorDifference( parent.getAxes(), forEachAxes ); + + internal::Rule rule = internal::Rule::NONE; + + alp::internal::ale.addStage( alp::internal::Stagetype::STORE, rule, parent, difference_axes, forEachAxes ); + } + + void set( + Tensor &tout, + Tensor &tin, + const std::vector< int > &activeAxes + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + + internal::Rule rule = internal::Rule::NONE; + + alp::internal::ale.addStage( alp::internal::Stagetype::SET_TENSOR, rule, tout, tin, activeAxes, forEachAxes ); + } + + void set( + Tensor &tout, + double alpha //TODO perhaps use a templated datatype instead of double + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes 
); + + internal::Rule rule = internal::Rule::NONE; + + alp::internal::ale.addStage( alp::internal::Stagetype::SET_SCALAR, rule, tout, alpha, forEachAxes ); + } + + void apply( + Tensor &tout, + Tensor &tin1, + Tensor &tin2, + const std::string &opName, + const std::vector< int > &activeAxes + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); +/* + std::vector< int > union_axes = internal::vectorUnion( tout.getAxes(), tin1.getAxes() ); + union_axes = internal::vectorUnion( union_axes, tin2.getAxes() ); + + assert( union_axes.size() < 3 ); + + std::vector< int > temp_axes; + if( union_axes.size() == 1 ) { + temp_axes.push_back( union_axes[ 0 ] ); + } else if ( union_axes.size() == 2 ) { + temp_axes.push_back( union_axes[ 1 ] ); + } + + // create a temporary Tensor + Tensor temp( temp_axes, tout.getType() ); +*/ + internal::Rule rule = internal::Rule::NONE; + + //TODO the current design does not make a distinction between the different cases + // of BCAST and REDUCE, this should be fixed in a later version + if( tin1.getAxes() == tin2.getAxes() && tout.getAxes() == tin1.getAxes() ) { + rule = internal::Rule::EWISE; + } else if ( tin1.getAxes() == tin2.getAxes() && internal::vectorSubset( tout.getAxes(), tin1.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else if ( tin1.getAxes() == tin2.getAxes() && internal::vectorSubset( tin1.getAxes(), tout.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else if ( tin1.getAxes() == tout.getAxes() && internal::vectorSubset( tout.getAxes(), tin2.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else if ( tin1.getAxes() == tout.getAxes() && internal::vectorSubset( tin2.getAxes(), tout.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else if ( tin2.getAxes() == tout.getAxes() && internal::vectorSubset( tout.getAxes(), tin1.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else if ( tin2.getAxes() == tout.getAxes() && internal::vectorSubset( tin1.getAxes(), tout.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else if ( tin1.getAxes() != tin2.getAxes() && tin1.getAxes() != tout.getAxes() && tin2.getAxes() != tout.getAxes() ) { + if( internal::vectorSubset( tout.getAxes(), tin1.getAxes() ) == true && internal::vectorSubset( tout.getAxes(), tin2.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else if( internal::vectorSubset( tin1.getAxes(), tout.getAxes() ) == true && internal::vectorSubset( tin2.getAxes(), tout.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else { + std::cerr << "The axes of the output tensor cannot be subset of the axes of one input and superset of the axes of the other input: apply " << opName << std::endl; + std::abort(); + } + } + + if( opName == "minus" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::APPLY_MINUS, rule, tout, tin1, tin2, activeAxes, forEachAxes ); + } else if( opName == "add" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::APPLY_ADD, rule, tout, tin1, tin2, activeAxes, forEachAxes ); + } + else { + + } + } + + void foldl( + Tensor &tinout, + Tensor &tin, + const std::string &opName, + const std::vector< int > &activeAxes + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); +/* + std::vector< int > union_axes = internal::vectorUnion( tinout.getAxes(), tin.getAxes() ); + + assert( union_axes.size() < 3 ); + + std::vector< int > temp_axes; + if( union_axes.size() == 1 ) { + temp_axes.push_back( 
union_axes[ 0 ] ); + } else if ( union_axes.size() == 2 ) { + temp_axes.push_back( union_axes[ 1 ] ); + } + + // create a temporary Tensor + Tensor temp( temp_axes, tinout.getType() ); +*/ + internal::Rule rule = internal::Rule::NONE; + + if( tinout.getAxes() == tin.getAxes() ) { + rule = internal::Rule::EWISE; + } else if ( internal::vectorSubset( tinout.getAxes(), tin.getAxes() ) == true ) { + rule = internal::Rule::REDUCE; + } else if ( internal::vectorSubset( tin.getAxes(), tinout.getAxes() ) == true ) { + rule = internal::Rule::BCAST; + } else { + + } + + if( opName == "divide" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_DIVIDE, rule, tinout, tin, activeAxes, forEachAxes ); + } else if( opName == "max" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_MAX, rule, tinout, tin, activeAxes, forEachAxes ); + } else if( opName == "times" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_TIMES, rule, tinout, tin, activeAxes, forEachAxes ); + } else if( opName == "add" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_ADD, rule, tinout, tin, activeAxes, forEachAxes ); + } else { + + } + } + +// template< size_t sm, size_t pm > + void foldl( +// const Grid< sm, pm > &grid, + Tensor &tinout, + const std::string &opName, + const std::vector< int > &activeAxes + ) { + std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + + internal::Rule rule = internal::Rule::NONE; + + if( opName == "exp" ) { + alp::internal::ale.addStage( alp::internal::Stagetype::FOLDL_EXP, rule, tinout, activeAxes, forEachAxes ); + } else { + + } + } + +} + diff --git a/src/graphblas/ascend/opgen.cpp b/src/graphblas/ascend/opgen.cpp new file mode 100644 index 000000000..bb3c6370f --- /dev/null +++ b/src/graphblas/ascend/opgen.cpp @@ -0,0 +1,211 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file + * + * Provides CodeGen for the Ascend backend. + * + * @author A. N. Yzelman + * @date 12th of September, 2023 + */ + +#include +#include + +namespace alp +{ + namespace internal + { + extern SymbolTable symbols; + } +} + +std::string alp::internal::OpGen::kernel_id; + +size_t alp::internal::OpGen::forEachLevel = 0; + +std::vector< std::vector< int > > alp::internal::OpGen::forEachAxes; +std::vector< int > alp::internal::OpGen::lastAxes; + +//TODO how is this supposed to be used? 
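The rule deduction used by alp::apply and alp::foldl in operators.cpp above selects between element-wise, reduction, and broadcast code paths purely from the relation between the output and input axis sets. The sketch below reproduces that decision for the two-input case under the assumption that axes are plain std::vector< int > values, as returned by Tensor::getAxes(); Rule and isSubset are local stand-ins, and the final two branches mirror the case analysis exactly as written above, including its choice of BCAST versus REDUCE when all three axis sets differ.

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

enum class Rule { NONE, EWISE, BCAST, REDUCE };

// true if every axis in sub also appears in super
static bool isSubset( std::vector< int > sub, std::vector< int > super ) {
	std::sort( sub.begin(), sub.end() );
	std::sort( super.begin(), super.end() );
	return std::includes( super.begin(), super.end(), sub.begin(), sub.end() );
}

// Deduce the rule for out = op( in1, in2 ) from the axis sets alone.
static Rule deduceRule(
	const std::vector< int > &out,
	const std::vector< int > &in1,
	const std::vector< int > &in2
) {
	if( in1 == in2 ) {
		if( out == in1 ) { return Rule::EWISE; }
		if( isSubset( out, in1 ) ) { return Rule::REDUCE; }
		if( isSubset( in1, out ) ) { return Rule::BCAST; }
		return Rule::NONE;
	}
	if( in1 == out ) {
		if( isSubset( out, in2 ) ) { return Rule::REDUCE; }
		if( isSubset( in2, out ) ) { return Rule::BCAST; }
		return Rule::NONE;
	}
	if( in2 == out ) {
		if( isSubset( out, in1 ) ) { return Rule::REDUCE; }
		if( isSubset( in1, out ) ) { return Rule::BCAST; }
		return Rule::NONE;
	}
	// all three axis sets differ: mirrors the source as written above
	if( isSubset( out, in1 ) && isSubset( out, in2 ) ) { return Rule::BCAST; }
	if( isSubset( in1, out ) && isSubset( in2, out ) ) { return Rule::REDUCE; }
	return Rule::NONE;
}

int main() {
	assert( deduceRule( { 0, 1 }, { 0, 1 }, { 0, 1 } ) == Rule::EWISE );
	assert( deduceRule( { 0 }, { 0, 1 }, { 0, 1 } ) == Rule::REDUCE ); // axis 1 is folded away
	assert( deduceRule( { 0, 1 }, { 0 }, { 0 } ) == Rule::BCAST );     // inputs replicated along axis 1
	std::cout << "rule deduction checks passed\n";
	return 0;
}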
+//std::map< std::string, std::string > alp::internal::OpGen::chunkSize; + +std::stringstream alp::internal::OpGen::aux_func; +std::stringstream alp::internal::OpGen::analyticModelFormalParams; +std::stringstream alp::internal::OpGen::hostFormalParam; +std::stringstream alp::internal::OpGen::hostArg; +std::stringstream alp::internal::OpGen::constrBody; +std::stringstream alp::internal::OpGen::hostBody; +std::stringstream alp::internal::OpGen::classMembers; +std::stringstream alp::internal::OpGen::initBody; +std::stringstream alp::internal::OpGen::genericProcessBody; +std::stringstream alp::internal::OpGen::declarations; + +std::vector< std::stringstream > alp::internal::OpGen::processFunc; +std::vector< std::stringstream > alp::internal::OpGen::computeFunc; +std::vector< std::stringstream > alp::internal::OpGen::copyinFunc; +std::vector< std::stringstream > alp::internal::OpGen::copyoutFunc; + +void alp::internal::OpGen::compileClear() { + +// chunkSize.clear(); //TODO how is this supposed to be used? + + forEachAxes.clear(); + + alp::internal::symbols.clearAll(); + + aux_func.clear(); + analyticModelFormalParams.clear(); + hostFormalParam.clear(); + hostArg.clear(); + constrBody.clear(); + classMembers.clear(); + initBody.clear(); + genericProcessBody.clear(); + declarations.clear(); + + processFunc.clear(); + computeFunc.clear(); + copyinFunc.clear(); + copyoutFunc.clear(); +} + +// TODOs: +// +// 1. rely on lazy evaluation (le)'s pipelines instead of our own input, output and stages info +// 2. use streams (such as the above initStreams) to generate the content of each component separately. +// - for example, the parameters to the init function could be gathered in initStream; etc. +// 3. instead of generating the copyIn, copyOut, and compute of the add operation directly, rely on a +// library of AscendC kernels +// - for example, we should generate in the case of a fused sequence of operators OP1 and OP2: +// __aicore__ inline void CopyIn( int32_t progress ) { +// OP1::CopyIn( progress, ... ); +// OP2::CopyIn( progress, ... ); +// // and so on for further fused ops +// } +// 4. Instead of hardcoding half as the data type, we should generate it appropriately from the typename T +void alp::internal::OpGen::generate( std::ostream &os ) { + os << "\n// start automatic ALP/Ascend opgen\n\n"; + + os << "#include \n\n"; + os << "#include \"ascendlib.hpp\"\n\n"; + os << "using namespace AscendC;\n\n"; + + // TODO this should be generated by the grid.forEach + // TODO should we support both following modes? + // mode 1: symbolic, the below are parameters to the call to custom_kernels, cannot be constexpr(!) + // mode 2: the user passes explicit parameters into alp::Grid, then instead of symbolic output, actual values are emitted as in below + os << "constexpr int32_t BUFFER_NUM = 1; \n"; // TODO TBC indicates local buffer space in a pipe? 
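	// Note: in the public AscendC kernel samples, BUFFER_NUM is the per-queue
	// buffer count passed to pipe.InitBuffer(), and a value of 2 enables double
	// buffering between copy-in and compute; the value 1 emitted here therefore
	// keeps single buffering.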
+ os << "\n"; + + os << aux_func.str(); + os << "\n"; + + std::stringstream initFormalParam; + std::stringstream customFormalParam; + std::stringstream allAccessedArg; + std::stringstream allTempLocalDecl; + + alp::internal::symbols.generateGlobalSymbols( initFormalParam, customFormalParam, allAccessedArg, allTempLocalDecl ); + + // generate class header + os << "class " << kernel_id << " {\n\n"; + os << "\tpublic:\n\n"; + + // generate class constructor + os << "\t\t__aicore__ inline " << kernel_id << "( "; + os << hostFormalParam.str(); + os << analyticModelFormalParams.str(); + os << " ) {\n"; + os << constrBody.str(); + os << "\t\t}\n\n\n"; + + // generate init function + os << "\t\t__aicore__ inline void Init( "; + os << initFormalParam.str(); + os << " ) {\n"; + os << initBody.str(); + os << "\t\t}\n\n"; + + // insert the Process functions + for( auto it = processFunc.cbegin(); it != processFunc.cend(); ++it ) { + os << it->str(); + os << "\n\n"; + } + + // generate the generic Process functions + os << "\t\t__aicore__ inline void Process() {\n"; + os << "\n"; + os << genericProcessBody.str(); + os << "\t\t}\n\n\n"; + + // declare private fields + os << "\tprivate:\n\n"; + os << classMembers.str(); +// os << "\n"; + os << "\t\tTPipe pipe;\n"; + os << "\n"; + os << declarations.str(); + os << allTempLocalDecl.str(); + // end of class + os << "};\n\n"; + + // generate entry function + os << "extern \"C\" __global__ __aicore__ void custom_" << kernel_id << "(\n\t"; + //print the list of all input and output vectors for the arguments list + os << initFormalParam.str(); + os << ",\n\t"; + os << hostFormalParam.str(); + os << analyticModelFormalParams.str(); + os << "\n) {\n"; + os << "\t" << kernel_id << " op( "; + os << hostArg.str(); + os << " );\n"; + os << "\top.Init( "; + os << allAccessedArg.str(); + os << " );\n"; + os << "\top.Process();\n"; + os << "}\n\n"; + + // TODO do we absolutely need to generate the host entry point here? 
+ os << "#ifndef __CCE_KT_TEST__\n"; + os << "void custom_" << kernel_id << "_do(\n" + << "\tuint32_t blockDim,\n" + << "\tvoid *l2ctrl,\n" + << "\tvoid *stream,\n\t"; + os << customFormalParam.str(); + os << ",\n\t"; + os << hostFormalParam.str(); + os << analyticModelFormalParams.str(); + os << "\n) {\n"; + + // generate analytic model +// os << hostBody.str(); + + // generate entry point + os << "\tcustom_" << kernel_id << "<<< blockDim, l2ctrl, stream >>>( "; + os << allAccessedArg.str(); + os << ", "; + os << hostArg.str(); + os << " );\n"; + os << "}\n"; + os << "#endif\n\n"; + + os << "// end automatic ALP/Ascend opgen\n\n"; +} + diff --git a/src/graphblas/ascend/pipeline.cpp b/src/graphblas/ascend/pipeline.cpp new file mode 100644 index 000000000..937d9d3ba --- /dev/null +++ b/src/graphblas/ascend/pipeline.cpp @@ -0,0 +1,998 @@ +#include + +#include +#include +#include +#include +#include + +namespace alp +{ + namespace internal + { + extern iGrid *igrid; + extern SymbolTable symbols; + } +} + +alp::internal::AscendPipeline::AscendPipeline( size_t _id ) : id( _id ) +{ + +} + +void alp::internal::AscendPipeline::insertTensorToInputs( const alp::Tensor &tensor ) +{ + accessed.insert( alp::internal::symbols.getTensorFromView( tensor ) ); +} + +void alp::internal::AscendPipeline::insertFreeInputTensorStages( const std::vector< int > &forEachAxes ) +{ + std::vector< alp::internal::Stage * > st; + + // search for all GET_VIEW stages in the pipeline and store them + for( auto it = stages.begin(); it != stages.end(); ++it ) { + if (it->getOpType() == internal::Stagetype::GET_VIEW ) { + st.push_back( &(*it) ); + } + } + + // search for all STORE stages in the pipeline and delete + // the corresponding GET_VIEW from those stored above + for( auto it = stages.begin(); it != stages.end(); ++it ) { + if (it->getOpType() == internal::Stagetype::STORE ) { + for( auto jt = st.begin(); jt != st.end(); ) { + if( (*jt)->getTensor0().getID() == it->getTensor0().getID() ) { + jt = st.erase( jt ); + } else { + ++jt; + } + } + } + } + + // for the remaining GET_VIEW stages that are still stored + // insert a new stage in the end of the pipeline that + // corresponds to all input tensors for which store + // is not explicitly invoked by the user + for( auto it = st.begin(); it != st.end(); ++it ) { + if( (*it)->getForEachAxes() == forEachAxes ) { + addStage( alp::internal::Stagetype::IMPLICIT_FREE, (*it)->getRule(), (*it)->getTensor0(), (*it)->getAxes(), (*it)->getForEachAxes() ); + } + } +} + +std::set< int > alp::internal::AscendPipeline::getIteratedAxes() const { + std::vector< int > union_iterated_axes; + + for( auto it = stages.begin(); it != stages.end(); ++it ) { + union_iterated_axes = internal::vectorUnion( union_iterated_axes, it->getForEachAxes() ); + } + + // convert the std::vector to std::set + std::set< int > ret; + ret.insert( union_iterated_axes.begin(), union_iterated_axes.end() ); + return ret; +} + +const alp::Tensor &alp::internal::AscendPipeline::store( const alp::Tensor &output_tensor ) { + + //FIXME I should check here that this is indeed a VIEW + + const alp::Tensor &parent = alp::internal::symbols.getTensorFromView( output_tensor ); + outputs.insert( parent ); + + alp::internal::symbols.addOutputTensor( parent ); + + return parent; +} + +bool alp::internal::AscendPipeline::isOutput( const alp::Tensor &tensor ) const { + + //FIXME I should check here that this is indeed a VIEW + + return outputs.find( tensor ) != outputs.end(); +} + +void alp::internal::AscendPipeline::clear() { + 
+ stages.clear(); + accessed.clear(); + outputs.clear(); +} + +size_t alp::internal::AscendPipeline::getID() const { + + return id; +} + +std::string alp::internal::AscendPipeline::getTilingAxes() const { + + std::string tiling_init_numerator; + + const std::set< int > iterated_axes = getIteratedAxes(); + + for( auto it = iterated_axes.begin(); it != iterated_axes.end(); ++it ) { + tiling_init_numerator.append( igrid->tileSize( *it ) ); + tiling_init_numerator.append( " * " ); + } + + return tiling_init_numerator; +} + +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const double alpha, const std::vector< int > &forEachAxes ) { + + // insert the Tensor to the set of accessed data + insertTensorToInputs( tensor1 ); + // get the name of the Tensor object that exists behind this view or tensor + + switch ( op_type ) { + + case alp::internal::Stagetype::SET_SCALAR: + { + stages.push_back( std::move( alp::internal::Stage( *this, + alp::internal::Stagetype::SET_SCALAR, rule, tensor1, alpha, forEachAxes ) ) ); + + break; + } + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + std::cerr << "Stage: " << (int) op_type << " has only one tensor argument" << std::endl; + std::abort(); + break; + } + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::APPLY_ADD: + case alp::internal::Stagetype::APPLY_MINUS: + case alp::internal::Stagetype::FOLDL_DIVIDE: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + { + std::cerr << "Stage: " << (int) op_type << " has more than one tensor arguments" << std::endl; + std::abort(); + break; + } + } +} + +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + // insert the Tensor to the set of accessed data + insertTensorToInputs( tensor1 ); + // get the name of the Tensor object that exists behind this view or tensor + + switch ( op_type ) { + + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + stages.push_back( std::move( alp::internal::Stage( *this, + op_type, rule, tensor1, activeAxes, forEachAxes ) ) ); + + break; + } + case alp::internal::Stagetype::SET_SCALAR: + case alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::APPLY_ADD: + case alp::internal::Stagetype::APPLY_MINUS: + case alp::internal::Stagetype::FOLDL_DIVIDE: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + { + std::cerr << "Stage: " << (int) op_type << " has more than one tensor arguments" << std::endl; + std::abort(); + break; + } + } +} + +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const alp::Tensor &tensor2, + const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + // insert the Tensors to the set of accessed data + insertTensorToInputs( tensor1 ); //TODO pass the string + insertTensorToInputs( tensor2 ); + + switch ( op_type ) { + + case 
alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + case alp::internal::Stagetype::FOLDL_DIVIDE: + { + stages.push_back( std::move( alp::internal::Stage( *this, + op_type, rule, tensor1, tensor2, activeAxes, forEachAxes ) ) ); + break; + } + case alp::internal::Stagetype::APPLY_ADD: + case alp::internal::Stagetype::APPLY_MINUS: + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::SET_SCALAR: + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + std::cerr << "Stage: " << (int) op_type << " does not have two tensor arguments" << std::endl; + std::abort(); + break; + } + } +} + +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const alp::Tensor &tensor2, + const alp::Tensor &tensor3, const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + // insert the Tensors to the set of accessed data + insertTensorToInputs( tensor1 ); //TODO pass the string + insertTensorToInputs( tensor2 ); + insertTensorToInputs( tensor3 ); + + switch ( op_type ) { + + case alp::internal::Stagetype::APPLY_MINUS: + case alp::internal::Stagetype::APPLY_ADD: + { + stages.push_back( std::move( alp::internal::Stage( *this, + op_type, rule, tensor1, tensor2, tensor3, activeAxes, forEachAxes ) ) ); + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: + case alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::SET_SCALAR: + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + std::cerr << "Stage: " << (int) op_type << " does not have three tensor arguments" << std::endl; + std::abort(); + break; + } + } +} +/* +void alp::internal::AscendPipeline::addStage( alp::internal::Stagetype op_type, alp::internal::Rule rule, + const alp::Tensor &tensor1, const alp::Tensor &tensor2, + const alp::Tensor &tensor3, const alp::Tensor &tensor4, + const std::vector< int > &activeAxes, const std::vector< int > &forEachAxes ) { + + // insert the Tensors to the set of accessed data + insertTensorToInputs( tensor1 ); //TODO pass the string + insertTensorToInputs( tensor2 ); + insertTensorToInputs( tensor3 ); + insertTensorToInputs( tensor4 ); + + switch ( op_type ) { + + case alp::internal::Stagetype::APPLY_MINUS: + { + //TODO tensor4 is a temporary variable + + stages.push_back( std::move( alp::internal::Stage( *this, + op_type, rule, tensor1, tensor2, tensor3, tensor4, activeAxes, forEachAxes ) ) ); + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: + case alp::internal::Stagetype::APPLY_ADD: + case alp::internal::Stagetype::SET_TENSOR: + case alp::internal::Stagetype::FOLDL_MAX: + case alp::internal::Stagetype::FOLDL_TIMES: + case alp::internal::Stagetype::FOLDL_ADD: + case alp::internal::Stagetype::FOLDL_EXP: + case alp::internal::Stagetype::SET_SCALAR: + case alp::internal::Stagetype::GET_VIEW: + case alp::internal::Stagetype::STORE: + case alp::internal::Stagetype::IMPLICIT_FREE: + { + std::cerr << "Stage: " << (int) op_type << " does not have four tensor arguments" << std::endl; + std::abort(); + break; + } + } +} 
+*/ +void alp::internal::AscendPipeline::generateDeclarations( + std::stringstream &declarations +) { + +// declarations << "\t\tuint32_t " << "block_length" << id << ";\n"; +// declarations << "\t\tuint32_t " << "tile_length" << id << ";\n"; +// declarations << "\n"; + + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isGlobalDecl() ) { + declarations << "\t\t// Global Tensor declaration\n"; + if( outputs.find( *it ) != outputs.end() ) { + // TQue< QuePosition::VECOUT, BUFFER_NUM > globalQue_tensor1_0; + declarations << "\t\tTQue< QuePosition::VECOUT, BUFFER_NUM > " + << it->getTQueBufName( id ) << ";\n"; + } else { + // TQue< QuePosition::VECIN, BUFFER_NUM > globalQue_tensor0_0; + declarations << "\t\tTQue< QuePosition::VECIN, BUFFER_NUM > " + << it->getTQueBufName( id ) << ";\n"; + } + // GlobalTensor< half > Gm_tensor0_0; + declarations << "\t\tGlobalTensor< " << internal::getDataType( it->getType() ) << " > " + << it->getAscendGlobalName( id ) << ";\n"; + // LocalTensor< half > Gm_local_tensor0_0; + declarations << "\t\tLocalTensor< " << internal::getDataType( it->getType() ) << " > " + << it->getAscendName( id ) << ";\n"; + } else if( it->isLocalDecl() ) { +/* declarations << "\t\t// Local Tensor declaration\n"; + // TBuf< QuePosition::VECCALC > localBuf_tensor4_0; + declarations << "\t\tTBuf< QuePosition::VECCALC > " + << it->getTQueBufName( id ) << ";\n"; + // LocalTensor< half > local_tensor4_0; + declarations << "\t\tLocalTensor< " << internal::getDataType( it->getType() ) << " > " + << it->getAscendName( id ) << ";\n"; +*/ + declarations << "\t\t// Offset for local Tensor declaration\n"; + declarations << "\t\tint32_t " << it->getAscendName( id ) << ";\n"; + } else if( it->isTempDecl() ) { +/* declarations << "\t\t// Temporary Tensor declaration\n"; + // TBuf< QuePosition::VECCALC > tempBuf_tensor5_0; + declarations << "\t\tTBuf< QuePosition::VECCALC > " + << it->getTQueBufName( id ) << ";\n"; + // LocalTensor< half > temp_tensor5_0; + declarations << "\t\tLocalTensor< " << internal::getDataType( it->getType() ) << " > " + << it->getAscendName( id ) << ";\n"; +*/ + declarations << "\t\t// Offset for temporary Tensor declaration\n"; + declarations << "\t\tint32_t " << it->getAscendName( id ) << ";\n"; + } + declarations << "\n"; + } +/* + if( temp_or_local_found == true ) { + declarations << "\t\t// Declaration of memory used for Local and Temporary tensor\n"; + declarations << "\t\tTBuf< QuePosition::VECCALC > " << "_temp_local;\n"; + declarations << "\t\tLocalTensor< " << "half" << " > " << "_temp_local_Buf;\n"; + declarations << "\n"; + } +*/ +} + +//void alp::internal::AscendPipeline::generateConstructor( std::stringstream &constructor ) { +/* + constructor << "\n"; + constructor << "\t\t\tblock_length" << id << " = ( "; + constructor << igrid->problemSize( 0 ); + for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) { + constructor << " * " << igrid->problemSize( i ); + } + constructor << " ) / ( "; + constructor << igrid->processSize( 0 ); + for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) { + constructor << " * " << igrid->processSize( i ); + } + constructor << " );\n"; + constructor << "\t\t\ttile_length" << id << " = ( "; + bool first = true; + for( size_t i = 0; i < igrid->getProblemOrder(); ++i ) { + //TODO this solution assumes that there is only one parallel axis, which is not true + // omit the problemSize variables for which the corresponding axes is defined in the parallel forEach + // we use the parallel axes of the first stage, 
any other stage can be used as well
+		// since all stages of the same pipeline have the same outer loop
+		if( stages.begin()->getForEachAxes()[ 0 ] != ( int ) i ) {
+			if( !first ) {
+				constructor << " * ";
+			}
+			constructor << igrid->problemSize( i );
+			first = false;
+		}
+	}
+	constructor << " ) / " << "BUFFER_NUM;\n";
+*/
+//}
+
+void alp::internal::AscendPipeline::generateHostBody( std::stringstream &os, std::stringstream &analyticModelArgs,
+	std::stringstream &analyticModelFormalParams, std::stringstream &analyticModelDecls,
+	std::stringstream &analyticModelConstrBody ) {
+	// analytic model codeblock
+	constexpr size_t ub_size = grb::config::ASCEND_CACHE_HIERARCHY<>::UB_SIZE;
+
+	// This is a symbolic analysis to find what the largest global tensors are.
+	// After this symbolic analysis, we will have generally identified multiple
+	// global tensors as candidates for being the largest. We're generally still
+	// not sure which of these will be the largest tensor at run-time. Therefore,
+	// there is still a final run-time component to find the largest tensor(s).
+	std::set< std::set< int > > largestGlobals;
+	std::vector< Tensor > minorTensors;
+	bool differingDynamicAxesPresent = false;
+	for( const auto &tensor : accessed ) {
+		if( tensor.getScope() == internal::Scope::GLOBAL ) {
+			// TODO FIXME think about a cheaper algorithm for computing this check
+			const auto &current = tensor.getAxes();
+			assert( current.size() > 0 );
+			// by default, register the current tensor (don't register only if symbolic
+			// analysis is sure it is smaller)
+			bool insert = true;
+			for( const auto &existing : largestGlobals ) {
+				if( existing.size() <= current.size() ) {
+					bool larger = true;
+					for( const unsigned int &axis : existing ) {
+						if( std::find( current.cbegin(), current.cend(), axis ) != current.cend() ) {
+							// in this case, static analysis cannot conclude that the current tensor
+							// is larger than this entry in largestGlobal -- check the next entry of
+							// largestGlobals instead
+							larger = false;
+							break;
+						} else {
+							// check if the differing axis is a dynamic one
+							if( getIteratedAxes().find( axis ) != getIteratedAxes().cend() ) {
+								differingDynamicAxesPresent = true;
+							}
+						}
+					}
+					if( larger ) {
+						// in this case, the current tensor is guaranteed larger than this entry
+						// in largestGlobals -- so remove this entry, then flag the current axes
+						// for insertion.
+						(void) largestGlobals.erase( existing );
+						insert = true;
+						// By induction, furthermore, there are no other entries in largestGlobals
+						// that could contain the current tensor. So we terminate the check as
+						// well.
+						break;
+					}
+				} else {
+					bool smaller = true;
+					for( const unsigned int &axis : current ) {
+						if( existing.find( axis ) == existing.cend() ) {
+							// check if the differing axis is a dynamic one
+							if( getIteratedAxes().find( axis ) != getIteratedAxes().cend() ) {
+								differingDynamicAxesPresent = true;
+							}
+							// in this case, current is not a subset of this entry in largestGlobals,
+							// so we cannot conclude that current is smaller -- check the next one
+							smaller = false;
+							break;
+						}
+					}
+					if( smaller ) {
+						// in this case, current is a subset of this entry in largestGlobals, and
+						// so we can ignore the current tensor and move to the next one
+						insert = false;
+						// for allowing the analytic model to compute the exact buffer usage, we
+						// still record the tensor
+						minorTensors.push_back( tensor );
+						break;
+					}
+				}
+			}
+			if( insert ) {
+				std::set< int > tempSet( current.cbegin(), current.cend() );
+				(void) largestGlobals.insert( tempSet );
+			}
+		}
+	}
+
+	// start codegen: constructor
+	os << "\tasc::AnalyticModel< " << igrid->getProcessOrder() << ", "
+		<< igrid->getProblemOrder() << ", "
+		<< (differingDynamicAxesPresent ? "true" : "false")
+		<< " > am( " << ub_size << ", { ";
+	os << "_" << igrid->processSize( 0 );
+	for( size_t i = 1; i < igrid->getProcessOrder(); ++i ) {
+		os << ", _" << igrid->processSize( i );
+	}
+	os << " }, { ";
+	os << "_" << igrid->problemSize( 0 );
+	for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) {
+		os << ", _" << igrid->problemSize( i );
+	}
+	os << " }, { ";
+	{
+		const auto &axes = getIteratedAxes();
+		if( axes.find( 0 ) != axes.cend() ) {
+			os << "true";
+		} else {
+			os << "false";
+		}
+		for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) {
+			if( axes.find( i ) != axes.cend() ) {
+				os << ", true";
+			} else {
+				os << ", false";
+			}
+		}
+	}
+	os << " } );\n";
+
+	// add minor tensors
+	for( const auto &tensor : minorTensors ) {
+		const auto &current = tensor.getAxes();
+		os << "\tam.addMinorTensor( sizeof( "
+			<< internal::getDataType( tensor.getType() )
+			<< " ), { ";
+		if( std::find( current.cbegin(), current.cend(), 0 ) == current.cend() ) {
+			os << "false";
+		} else {
+			os << "true";
+		}
+		for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) {
+			if( std::find( current.cbegin(), current.cend(), i ) == current.end() ) {
+				os << ", false";
+			} else {
+				os << ", true";
+			}
+		}
+		os << " } );\n";
+	}
+
+	// add global non-minor tensors
+	for( const auto &tensor : accessed ) {
+		const auto &axes = tensor.getAxes();
+		std::set< int > tempSet( axes.cbegin(), axes.cend() ); // TODO FIXME not the most performant code
+		if( tensor.getScope() != internal::Scope::GLOBAL ) { continue; }
+		if( largestGlobals.find( tempSet ) == largestGlobals.cend() ) { continue; }
+		assert( axes.size() > 0 );
+		os << "\tam.addGlobalTensor( sizeof( "
+			<< internal::getDataType( tensor.getType() )
+			<< " ), { ";
+		size_t k = 0;
+		if( std::find( axes.cbegin(), axes.cend(), 0 ) != axes.cend() ) {
+			os << "true";
+		} else {
+			os << "false";
+		}
+		(void) ++k;
+		for( ; k < igrid->getProblemOrder(); ++k ) {
+			if( std::find( axes.cbegin(), axes.cend(), k ) != axes.cend() ) {
+				os << ", true";
+			} else {
+				os << ", false";
+			}
+		}
+		os << " } );\n";
+	}
+
+	// add buffers
+	for( const auto &tensor : accessed ) {
+		if( tensor.getScope() != internal::Scope::GLOBAL ) {
+			const auto &axes = tensor.getAxes();
+			os << "\tam.addBuffer( sizeof( "
+				<< internal::getDataType( tensor.getType() )
+				<< " ), { ";
+			if( std::find( axes.cbegin(), axes.cend(), 0 ) == axes.cend() ) {
+				os << "false";
+			} else {
+				os <<
"true"; + } + for( size_t i = 1; i < igrid->getProblemOrder(); ++i ) { + if( std::find( axes.cbegin(), axes.cend(), i ) == axes.cend() ) { + os << ", false"; + } else { + os << ", true"; + } + } + os << " } );\n"; + } + } + + // add stages + // TODO ideally, all AscendC functions have a unique identifier and an array of + // those are passed to the analytic model. For now, we just transfer a + // count instead. + os << "\tam.setNumStages( " << stages.size() << " );\n"; + + // Now, finally, the analytic model has all info it needs -- get the blocksizes! + for( auto axes : getIteratedAxes() ) { + os << "\tconst uint32_t _tile_size" << axes << " = am.getBlockSize( " + << axes << " );\n"; + } + os << "\n"; + + + // done: move this to the host + // for( auto axes : getIteratedAxes() ) { + // os << "\tconst uint32_t _tile_size" << axes << " = 1;\n"; + // } + // os << "\n"; + + analyticModelConstrBody << "\n"; + for( const auto &axes : getIteratedAxes() ) { + analyticModelConstrBody << "\t\t\ttile_size" << axes << " = _tile_size" << axes << ";\n"; + analyticModelFormalParams << ", const uint32_t _tile_size" << axes; + analyticModelDecls << "\t\tuint32_t tile_size" << axes << ";\n\n"; + analyticModelArgs << ", _tile_size" << axes; + } + // end analytic model code block +} + +void alp::internal::AscendPipeline::generateInit( std::stringstream &init ) { + + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isGlobalDecl() ) { + + assert( it->getAxes().size() > 0 ); + + // n0 * n1 * n2 * n3 ... + std::string set_numerator( igrid->problemSize( *( it->getAxes().begin() ) ) ); + // p0 * p1 * p2 * n3 ... + std::string set_denominator( igrid->processSize( *( it->getAxes().begin() ) ) ); + for( auto jt = ++it->getAxes().begin(); jt != it->getAxes().end(); ++jt ) { + + set_numerator.append( " * " + igrid->problemSize( *jt ) ); + set_denominator.append( " * " + igrid->processSize( *jt ) ); + } + + // n2 * n3 ... 
(e.g., n0 and n1 are excluded since they are the loop axes) + std::string non_parallel_init_numerator; + + for( auto jt = stages.cbegin(); jt != stages.cend(); ++jt ) { + + if( jt->getOpType() == internal::Stagetype::GET_VIEW && jt->getTensor0().getID() == it->getID() ) { + + bool first = true; + for( auto kt = it->getAxes().cbegin(); kt != it->getAxes().cend(); ++kt ) { + if( std::find( jt->getForEachAxes().begin(), jt->getForEachAxes().end(), ( int ) *kt ) == jt->getForEachAxes().end() ) { + if( !first ) { + non_parallel_init_numerator.append( " * " ); + } + non_parallel_init_numerator.append( igrid->problemSize( *kt ) ); + first = false; + } + } + break; + } + } + + if( non_parallel_init_numerator.empty() ) { + non_parallel_init_numerator.assign( "1" ); + } + + std::string tiling_init_numerator = getTilingAxes(); + + init << "\n"; + init << "\t\t\t// Initializing data for a Global Tensor\n"; + init << "\t\t\t" << it->getAscendGlobalName( id ) + << ".SetGlobalBuffer( ( __gm__ " + << internal::getDataType( it->getType() ) << " * )" + << it->getName() << " + ( " << set_numerator << " ) / ( " << set_denominator << " ) * GetBlockIdx(), ( " + << set_numerator << " ) / ( " << set_denominator << " ) );\n"; + init << "\t\t\tpipe.InitBuffer( " << it->getTQueBufName( id ) << ", BUFFER_NUM, " + << tiling_init_numerator + << "( ( " << non_parallel_init_numerator << " ) / BUFFER_NUM ) * sizeof( " + << internal::getDataType( it->getType() ) << " ) );\n"; + } + } +/* + //TODO these two loops can be fused, the only reason are written that way + // was to solve or avoid a bug regarding the order of the init + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isTempDecl() ) { + init << "\n"; + init << "\t\t\t// Initializing data for a temporary Tensor\n"; + init << "\t\t\tpipe.InitBuffer( " << it->getTQueBufName( id ) + << ", " << "totWorkSpaceSize" << " );\n"; + init << "\t\t\t" << it->getAscendName( id ) << " = " + << it->getTQueBufName( id ) + << ".Get< " << internal::getDataType( it->getType() ) << " >();\n"; + } + } + + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isLocalDecl() ) { + init << "\n"; + //TODO fix that + std::vector< int > forEachParallelAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes ); + const std::vector< int > &axes = it->getAxes(); + std::vector< int > local_iterated_axes; + + //TODO: is this set correct? is it necessary, if yes, perhaps a sort is required + local_iterated_axes = internal::vectorDifference( axes, forEachParallelAxes ); + + std::string product_dim(""); + bool first = true; + for( auto it = local_iterated_axes.cbegin(); it != local_iterated_axes.cend(); ++it ) { + if( first == true ) { + first = false; + } else { + product_dim.append(" * "); + } + product_dim.append( igrid->problemSize( *it ) ); + } + + if( product_dim.empty() == true ) { + product_dim.append( "1" ); + } + init << "\t\t\t// Initializing data for a local Tensor\n"; + init << "\t\t\tpipe.InitBuffer( " << it->getTQueBufName( id ) + << ", " << product_dim << " );\n"; + init << "\t\t\t" << it->getAscendName( id ) << " = " + << it->getTQueBufName( id ) + << ".Get< " << internal::getDataType( it->getType() ) << " >();\n"; + } + } +*/ + init << "\n"; + + std::string prev("totWorkSpaceSize"); + std::string prev_dim(""); + for( auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + if( it->isLocalDecl() || it->isTempDecl() ) { + init << "\t\t\t" << it->getAscendName( id ) << " = " << prev << ( prev_dim.empty() ? 
";\n" : ( " + " + prev_dim + ";\n" ) ); + + if( it->getAxes().size() > 0 ) { + // n0 * n1 * n2 ... + std::string set_numerator( igrid->problemSize( *( it->getAxes().begin() ) ) ); + for( auto jt = ++it->getAxes().begin(); jt != it->getAxes().end(); ++jt ) { + + set_numerator.append( " * " + igrid->problemSize( *jt ) ); + } + prev_dim = set_numerator; + } else { + prev_dim = "16"; + } + prev = it->getAscendName( id ); + } + } +} + +void alp::internal::AscendPipeline::generateProcess( std::stringstream &process, + std::stringstream &processCall ) { + + processCall << "\t\t\tProcess" << id << "();\n"; + + // generate the Process function + // TODO here we should use the grid info and symbolic analytic model + + process << "\n"; + process << "\t\t__aicore__ inline void Process" << id << "() {\n"; + process << "\n"; + + std::string tabs(""); + + // use a stack to keep track of the for loops that are already generated + std::vector< int > stack; +// std::vector< std::pair< std::string, std::pair< std::string, std::string > > > tiling_stack; + + int parallel_axe = *( stages.cbegin()->getForEachAxes().begin() ); + // initialize the stack with the axe of the outer forEach + // which is the parallel loop and thus can be omitted + stack.push_back( parallel_axe ); + +// bool new_nested_level = true; + + // declare variables for the upper bound of the extra loops that are introduced + std::set< int > iterated_axes = getIteratedAxes(); + for( auto it = iterated_axes.cbegin(); it != iterated_axes.cend(); ++it ) { + process << tabs << "\t\t\tuint32_t upper_" << igrid->problemTileMode( *it ) << ";\n"; + } + + process << "\n"; + + process << tabs << "\t\t\tfor( uint32_t " << igrid->problemMainMode( parallel_axe ) + << " = 0; " << igrid->problemMainMode( parallel_axe ) << " < ( " + << igrid->problemSize( parallel_axe ) << " / " << igrid->processSize( parallel_axe ) << " ); " + << igrid->problemMainMode( parallel_axe ) << " += tile_size" << parallel_axe << " ) {\n"; + + tabs.append("\t"); + +/* std::stringstream tiling_loop, tiling_condition, tiling_var; + + tiling_condition.str(""); + tiling_condition << "\t\t\tupper_" << igrid->problemTileMode( parallel_axe ) << " = ( " + << "( " << igrid->problemSize( parallel_axe ) << " / " << igrid->processSize( parallel_axe ) + << " ) < ( " << igrid->problemMainMode( parallel_axe ) << " + tile_size" << parallel_axe << " ) ) ? 
" + << "( (" << igrid->problemSize( parallel_axe ) << " / " << igrid->processSize( parallel_axe ) << " ) - " + << igrid->problemMainMode( parallel_axe ) << " ) : ( tile_size" << parallel_axe << " );\n"; + + // the tiling loop is not added in the stack of generated loops + tiling_loop.str(""); + tiling_loop << "\t\t\tfor( uint32_t " << igrid->problemTileMode( parallel_axe ) + << " = " << "0" << "; " << igrid->problemTileMode( parallel_axe ) + << " < upper_" << igrid->problemTileMode( parallel_axe ) << "; " << igrid->problemTileMode( parallel_axe ) << "++ ) {\n"; + + tiling_var.str(""); + tiling_var << "\t\t\t\tconst uint32_t " << igrid->problemMode( parallel_axe ) << " = " << igrid->problemMainMode( parallel_axe ) + << " + " << igrid->problemTileMode( parallel_axe ) << ";\n"; + + + tiling_stack.push_back( std::make_pair( tiling_condition.str(), std::make_pair( tiling_loop.str(), tiling_var.str() ) ) ); +*/ + std::vector< int > prev_stage_axes; + + // generate AscendC code for the operators of the pipeline + for( auto it = stages.cbegin(); it != stages.cend(); ++it ) { + + // get the axes of the current stage + const std::vector< int > &forEachAxes = it->getForEachAxes(); + + // iterator of the stack + auto st = stack.begin(); + // iterator of the axes for the current stage + auto at = forEachAxes.begin(); + + // the number of axes that are currently in the stack + // and match the corresponding axes of the current stage + size_t match_axes = 0; + + // iterate over all the axes of the stack that match + // those of the stage, which implies that if the current + // stage goes into the current for loop, i.e., no loop + // needs to be created and no loop needs to be closed, + // all the axes should match + while( st != stack.end() ) { + + if( at == forEachAxes.end() || *st != *at ) { + break; + } + + ++match_axes; + ++st; + ++at; + } + + // if there was a mismatch on the axes between the + // already generated loops (stack) and the axes of the stage + // then the axes of the stack that do not match should be popped + // which implies that the generated loops should close + size_t to_pop_axes = stack.size() - match_axes; +/* + if( to_pop_axes > 0 ) { + // close the loops for tiling first + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << tabs << "\t\t}\n"; + tabs.pop_back(); + } + } +*/ + for( size_t i = 0; i < to_pop_axes; ++i ) { + +// tiling_stack.pop_back(); + + process << tabs << "\t\t}\n"; + stack.pop_back(); + tabs.pop_back(); + } +/* + // generate tiling loops if at least one loop of axes was closed + if( to_pop_axes > 0 ) { + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << "\n"; + process << tabs << jt->first; + process << tabs << jt->second.first; + process << "\n"; + process << tabs << jt->second.second; + tabs.append("\t"); + } + } +*/ + // iterator of the stack + st = stack.begin(); + // iterator of the axes for the current stage + at = forEachAxes.begin(); + + // iterate over all the axes of the stage as long as the + // corresponding axes are already in the stack, which implies + // the for loops are already generated + while( at != forEachAxes.end() ) { + + if( st == stack.end() ) { + break; + } + + // as long as the end of the stack was not reached + // the axes should match those of the current stage + // since all the elements did not match were popped + assert( *st != *at ); + + ++st; + ++at; + } +/* + // close tiling loops provides that + // a) no loop was already closed, otherwise the corresponding 
loops are closed + // b) this is not the first stage + // c) the axes of the previous stage are different than those of the current stage + // a situation that indicates these two stages are not nested in the same level + if( to_pop_axes == 0 && it != stages.cbegin() && prev_stage_axes != forEachAxes ) { + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << tabs << "\t\t}\n"; + tabs.pop_back(); + } + + } +*/ + // iterate over the rest of the axes of the stage, i.e., those + // that are not included in the stack and lead to generation of for loops + while( at != forEachAxes.end() ) { + +// new_nested_level = true; + + process << "\n"; + + process << tabs << "\t\t\tfor( uint32_t " << igrid->problemMainMode( *at ) + << " = 0; " << igrid->problemMainMode( *at ) << " < " + << igrid->problemSize( *at ) << "; " + << igrid->problemMainMode( *at ) << " += tile_size" << *at << " ) {\n"; + + tabs.append("\t"); + stack.push_back( *at ); +/* + tiling_condition.str(""); + tiling_condition << "\t\t\tupper_" << igrid->problemTileMode( *at ) << " = ( " + << igrid->problemSize( *at ) << " < ( " << igrid->problemMainMode( *at ) << " + tile_size" << *at << " ) ) ? " + << igrid->problemSize( *at ) << " - " << igrid->problemMainMode( *at ) << " : ( tile_size" << *at << " );\n"; + + // the tiling loop is not added in the stack of generated loops + tiling_loop.str(""); + tiling_loop << "\t\t\tfor( uint32_t " << igrid->problemTileMode( *at ) + << " = " << "0" << "; " << igrid->problemTileMode( *at ) << " < upper_" + << igrid->problemTileMode( *at ) << "; " << igrid->problemTileMode( *at ) << "++ ) {\n"; + + tiling_var.str(""); + tiling_var << "\t\t\t\tconst uint32_t " << igrid->problemMode( *at ) << " = " << igrid->problemMainMode( *at ) + << " + " << igrid->problemTileMode( *at ) << ";\n"; + + tiling_stack.push_back( std::make_pair( tiling_condition.str(), std::make_pair( tiling_loop.str(), tiling_var.str() ) ) ); +*/ ++at; + } +/* + if( new_nested_level ) { + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << "\n"; + process << tabs << jt->first; + process << tabs << jt->second.first; + process << "\n"; + process << tabs << jt->second.second; + tabs.append("\t"); + } + } +*/ + process << "\n"; + process << it->getOp( tabs ); + + // reset the flag to false +// new_nested_level = false; + + // set the axes of the previous stage to those of the current one + prev_stage_axes = forEachAxes; + } +/* + if( stack.size() > 0 ) { + // before closing a loop, all the generated loops for tiling should close as well + for( auto jt = tiling_stack.begin(); jt != tiling_stack.end(); ++jt ) { + process << tabs << "\t\t}\n"; + tabs.pop_back(); + } + } +*/ + // close all the generated for loops + // starting from 0 to generate the parallel/outer loop + // starting from 1 if the outer parallel/loop is not generated + for( size_t i = 0; i < stack.size(); ++i ) { + + process << tabs << "\t\t}\n"; + tabs.pop_back(); + } + + // the curly bracket for the process function + process << "\t\t}\n"; +} + +void alp::internal::AscendPipeline::debug_print() const { + + std::cerr << "ACCESSED: "; + for (auto it = accessed.cbegin(); it != accessed.cend(); ++it ) { + std::cerr << it->getName() << ", "; + } + + std::cerr << std::endl << std::endl << std::endl; +} diff --git a/src/graphblas/ascend/semantics.cpp b/src/graphblas/ascend/semantics.cpp new file mode 100644 index 000000000..68159593d --- /dev/null +++ b/src/graphblas/ascend/semantics.cpp @@ -0,0 +1,68 @@ + +/* + * Copyright 
2021 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include
+
+#include
+#include
+#include
+
+namespace alp
+{
+	namespace internal
+	{
+		extern iGrid *igrid;
+	}
+}
+
+bool alp::internal::invalidForEachAxes( const std::vector< int > &axes ) {
+
+	std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes );
+
+	for( auto it = axes.cbegin(); it != axes.cend(); ++it ) {
+
+		if( std::find( axes.cbegin(), axes.cend(), *it ) != it ) {
+			return true;
+		}
+		if( std::find( forEachAxes.cbegin(), forEachAxes.cend(), *it ) != forEachAxes.cend() ) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+bool alp::internal::invalidAxes( const std::vector< int > &axes ) {
+
+	std::vector< int > forEachAxes = internal::vectorOfVectorsToVector( internal::OpGen::forEachAxes );
+	std::vector< int > sorted_axes_copy = axes;
+	std::vector< int > intersection;
+
+	std::sort( forEachAxes.begin(), forEachAxes.end() );
+	std::sort( sorted_axes_copy.begin(), sorted_axes_copy.end() );
+
+	std::set_intersection(
+		forEachAxes.begin(), forEachAxes.end(),
+		sorted_axes_copy.begin(), sorted_axes_copy.end(),
+		std::back_inserter( intersection )
+	);
+
+	return ( intersection.size() > 0 );
+}
+
diff --git a/src/graphblas/ascend/stage.cpp b/src/graphblas/ascend/stage.cpp
new file mode 100644
index 000000000..ef8430166
--- /dev/null
+++ b/src/graphblas/ascend/stage.cpp
@@ -0,0 +1,985 @@
+#include
+#include
+#include
+#include
+
+namespace alp
+{
+	namespace internal
+	{
+		extern iGrid *igrid;
+		extern SymbolTable symbols;
+	}
+}
+
+//TODO double should be replaced by alp::Scalar
+alp::internal::Stage::Stage( const AscendPipeline &parent,
+	Stagetype _enum_op_type, Rule _rule,
+	const alp::Tensor &_tensor0,
+	const double _alpha,
+	const std::vector< int > &_forEachAxes )
+	: pipeline( parent ),
+	enum_op_type( _enum_op_type ),
+	rule( _rule ),
+	tensor0( _tensor0 ),
+	alpha( _alpha ),
+	forEachAxes( _forEachAxes )
+{
+	semanticsCheks();
+	computeMemoryOffsets();
+}
+
+alp::internal::Stage::Stage( const AscendPipeline &parent,
+	Stagetype _enum_op_type, Rule _rule,
+	const alp::Tensor &_tensor0,
+	const std::vector< int > &_activeAxes,
+	const std::vector< int > &_forEachAxes )
+	: pipeline( parent ),
+	enum_op_type( _enum_op_type ),
+	rule( _rule ),
+	tensor0( _tensor0 ),
+	activeAxes( _activeAxes ),
+	forEachAxes( _forEachAxes )
+{
+	semanticsCheks();
+	computeMemoryOffsets();
+}
+
+alp::internal::Stage::Stage( const AscendPipeline &parent,
+	Stagetype _enum_op_type, Rule _rule,
+	const alp::Tensor &_tensor0,
+	const alp::Tensor &_tensor1,
+	const std::vector< int > &_activeAxes,
+	const std::vector< int > &_forEachAxes )
+	: pipeline( parent ),
+	enum_op_type( _enum_op_type ),
+	rule( _rule ),
+	tensor0( _tensor0 ),
+	tensor1( _tensor1 ),
+	activeAxes( _activeAxes ),
+	forEachAxes( _forEachAxes )
+{
+	semanticsCheks();
+	computeMemoryOffsets();
+}
+
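+// The three-tensor Stage constructed below backs the APPLY_ADD and APPLY_MINUS
+// operations: tensor0 is the destination of the generated AscendC call, while
+// tensor1 and tensor2 are its inputs. As with every Stage constructor, the
+// semantics checks run and the memory-offset expressions are computed up front.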
+alp::internal::Stage::Stage( const AscendPipeline &parent, + Stagetype _enum_op_type, Rule _rule, + const alp::Tensor &_tensor0, + const alp::Tensor &_tensor1, + const alp::Tensor &_tensor2, + const std::vector< int > &_activeAxes, + const std::vector< int > &_forEachAxes ) + : pipeline( parent ), + enum_op_type( _enum_op_type ), + rule( _rule ), + tensor0( _tensor0 ), + tensor1( _tensor1 ), + tensor2( _tensor2 ), + activeAxes( _activeAxes ), + forEachAxes( _forEachAxes ) +{ + semanticsCheks(); + computeMemoryOffsets(); +} +/* +alp::internal::Stage::Stage( const AscendPipeline &parent, + Stagetype _enum_op_type, Rule _rule, + const alp::Tensor &_tensor0, + const alp::Tensor &_tensor1, + const alp::Tensor &_tensor2, + const alp::Tensor &_tensor3, + const std::vector< int > &_activeAxes, + const std::vector< int > &_forEachAxes ) + : pipeline( parent ), + enum_op_type( _enum_op_type ), + rule( _rule ), + tensor0( _tensor0 ), + tensor1( _tensor1 ), + tensor2( _tensor2 ), + tensor3( _tensor3 ), + activeAxes( _activeAxes ), + forEachAxes( _forEachAxes ) +{ + semanticsCheks(); + computeMemoryOffsets(); +} +*/ +alp::internal::Stagetype alp::internal::Stage::getOpType() const { + + return enum_op_type; +} + +alp::internal::Rule alp::internal::Stage::getRule() const { + + return rule; +} + +const alp::Tensor & alp::internal::Stage::getTensor0() const { + + return tensor0; +} + +const std::vector< int > & alp::internal::Stage::getAxes() const { + + return activeAxes; +} + +const std::vector< int >& alp::internal::Stage::getForEachAxes() const { + + return forEachAxes; +} + +std::string alp::internal::Stage::getOp( const std::string &tabs ) const { + + switch (enum_op_type) { + case alp::internal::Stagetype::APPLY_MINUS: + return generateApplyMinusOp( tabs ); + case alp::internal::Stagetype::APPLY_ADD: + return generateApplyAddOp( tabs ); + case alp::internal::Stagetype::FOLDL_DIVIDE: + return generateFoldlDivideOp( tabs ); + case alp::internal::Stagetype::FOLDL_MAX: + return generateFoldlMaxOp( tabs ); + case alp::internal::Stagetype::FOLDL_TIMES: + return generateFoldlTimesOp( tabs ); + case alp::internal::Stagetype::FOLDL_ADD: + return generateFoldlAddOp( tabs ); + case alp::internal::Stagetype::FOLDL_EXP: + return generateFoldlExpOp( tabs ); + case alp::internal::Stagetype::SET_TENSOR: + return generateSetTensorOp( tabs ); + case alp::internal::Stagetype::SET_SCALAR: + return generateSetScalarOp( tabs ); + case alp::internal::Stagetype::GET_VIEW: + return generateGetViewOp( tabs ); + case alp::internal::Stagetype::STORE: + return generateStoreOp( tabs ); + case alp::internal::Stagetype::IMPLICIT_FREE: + return generateImplicitFreeOp( tabs ); + default: + return generateToDoOp( tabs ); + } +} + +std::string alp::internal::Stage::generateApplyMinusOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + const std::string arg3 = tensor2.getAccessedElement( pipeline.getID() ); +// const std::string arg4 = tensor3.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorEwiseMinus( " << arg1 << ", " << arg2 << ", " + << arg3 << ", " << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockEwiseMinus( " << arg1 << ", " << 
arg2 << ", " + << arg3 << ", " << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorBcastMinus( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockBcastMinus( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::REDUCE: + { + break; + } + default: + { + std::cerr << "Invalid rule: apply minus" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateApplyAddOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + const std::string arg3 = tensor2.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 0) { + stage << tabs << "\t\t\tAdd( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << pipeline.getTilingAxes() << "1" << " );\n"; + } else if( op_axes.size() == 1) { + stage << tabs << "\t\t\tAdd( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\tAdd( " << arg1 << ", " << arg2 << ", " << arg3 << ", " + << igrid->problemSize( op_axes[ 0 ] ) << " * " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + break; + } + case Rule::REDUCE: + { + break; + } + default: + { + std::cerr << "Invalid rule: apply add" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateFoldlDivideOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); +// const std::string arg3 = tensor2.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + break; + } + case Rule::BCAST: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorBcastDivide( " << arg1 << ", " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockBcastDivide( " << arg1 << ", " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::REDUCE: + { + break; + } + default: + { + std::cerr << "Invalid rule: foldl divide" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + 
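+// Like generateFoldlDivideOp above, the foldl generators for binary operators
+// (max, times, add) dispatch on the Rule (EWISE, BCAST, or REDUCE) and on the
+// number of operator axes (one axis emits a Vector* call, two axes a Block*
+// call), then write a single call to the ALP AscendC kernel library into the
+// generated Process() body. For example, with out/in standing for the generated
+// local-tensor access expressions, a one-axis element-wise max is emitted as
+//   alp::VectorEwiseMax( out, out, in, n0 );
+// where n0 stands for the problem size of the single operator axis, whereas the
+// BCAST and REDUCE variants additionally pass a temporary buffer from the symbol
+// table and prepend the tiling expression returned by pipeline.getTilingAxes()
+// to the length argument.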
+std::string alp::internal::Stage::generateFoldlMaxOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorEwiseMax( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockEwiseMax( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + break; + } + case Rule::REDUCE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorReduceMax( " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockReduceMax( " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + default: + { + std::cerr << "Invalid rule: foldl max" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateFoldlTimesOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorEwiseMultiply( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockEwiseMultiply( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + break; + } + case Rule::REDUCE: + { + break; + } + default: + { + std::cerr << "Invalid rule: foldl times" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateFoldlAddOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + switch ( rule ) { + case Rule::EWISE: + { + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorEwiseSum( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockEwiseSum( " << arg1 << ", " << arg1 << ", " + << arg2 << ", " << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + case Rule::BCAST: + { + break; + } + case Rule::REDUCE: + { + if( op_axes.size() == 1) { + stage << tabs << 
"\t\t\talp::VectorReduceSum( " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockReduceSum( " << arg1 << ", " << arg2 << ", " + << alp::internal::symbols.getLocalTempTensorBuffer( tensor0.getType() ) << "[ 0 ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + break; + } + default: + { + std::cerr << "Invalid rule: foldl add" << std::endl; + std::abort(); + } + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateFoldlExpOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorExp( " << arg1 << ", " << arg1 + << ", " << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockExp( " << arg1 << ", " << arg1 + << ", " << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateSetTensorOp( const std::string &tabs ) const { + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string arg2 = tensor1.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorSet( " << arg1 << ", " << arg2 << ", " + << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockSet( " << arg1 << ", " << arg2 << ", " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateSetScalarOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + const std::string scalar = ( alpha == std::numeric_limits< double >::infinity() ) ? "65504.0" + : ( alpha == -std::numeric_limits< double >::infinity() ) ? 
"-65504.0" + : std::to_string( alpha ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + if( op_axes.size() == 1) { + stage << tabs << "\t\t\talp::VectorSet( " << arg1 << ", " << scalar << ", " + << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + } else if( op_axes.size() == 2) { + stage << tabs << "\t\t\talp::BlockSet( " << arg1 << ", " << scalar << ", " + << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + + return stage.str(); +} + +std::string alp::internal::Stage::generateGetViewOp( const std::string &tabs ) const { + + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + const size_t id = pipeline.getID(); + + if( pipeline.isOutput( tensor0 ) == true ) { + stage << tabs << "\t\t\t// Initializing data for an output global Tensor\n"; + stage << tabs << "\t\t\t" << tensor0.getAscendName( id ) << " = " + << tensor0.getTQueBufName( id ) << ".AllocTensor< " + << internal::getDataType( tensor0.getType() ) << " >();\n"; + } else { + stage << tabs << "\t\t\t// Initializing data for an input global Tensor\n"; + stage << tabs << "\t\t\t" << tensor0.getAscendName( id ) << " = " + << tensor0.getTQueBufName( id ) << ".AllocTensor< " + << internal::getDataType( tensor0.getType() ) << " >();\n"; + + if( op_axes.size() == 0 ) { + stage << tabs << "\t\t\talp::DataMove( " << tensor0.getAscendName( id ) + << "[ " << "0" << " ], " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << pipeline.getTilingAxes() << "1" << " );\n"; + + }else if( op_axes.size() == 1 ) { +// stage << tabs << "\t\t\tDataCopy( " << tensor0.getAscendName( id ) << ", " +// << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " +// << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + +// stage << tabs << "\t\t\talp::DataMove( " << tensor0.getAscendName( id ) << ", " +// << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + + stage << tabs << "\t\t\talp::DataMove( " << tensor0.getAscendName( id ) + << "[ " << "0" << " ], " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + + } else if( op_axes.size() == 2) { +// stage << tabs << "\t\t\tfor( uint32_t k = 0; k < " +// << igrid->problemSize( op_axes[ 0 ] ) << "; k++ ) {\n"; + +// stage << tabs << "\t\t\t\tDataCopy( " << tensor0.getAscendName( id ) +// << "[ k * " << igrid->problemSize( op_axes[ 1 ] ) << " ], " +// << tensor0.getAscendGlobalName( id ) +// << "[ " << tensor0_offset << " + k" << stride << " ], " +// << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + +// stage << tabs << "\t\t\t}\n"; + + stage << tabs << "\t\t\talp::DataMove( " << tensor0.getAscendName( id ) + << "[ " << "0" << " ], " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << ", " + << stride << ", " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + } + + stage << tabs << "\t\t\t" << tensor0.getTQueBufName( id ) + << ".EnQue( " << tensor0.getAscendName( id ) << " );\n"; + + stage << tabs << "\t\t\t" << tensor0.getAscendName( id ) << " = " + << tensor0.getTQueBufName( id ) + << ".DeQue< " << internal::getDataType( tensor0.getType() ) << " >();\n"; + } + + return stage.str(); +} + +std::string 
alp::internal::Stage::generateStoreOp( const std::string &tabs ) const { + + //TODO I should use the arg1 + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + + const std::vector< int > op_axes = computeOperatorAxes(); + std::stringstream stage; + + const size_t id = pipeline.getID(); + + stage << tabs << "\t\t\t// Copying data of an output Tensor back to the global memory\n"; + stage << tabs << "\t\t\t" << tensor0.getTQueBufName( id ) + << ".EnQue< " << internal::getDataType( tensor0.getType() ) + << " >( " << tensor0.getAscendName( id ) << " );\n"; + stage << tabs << "\t\t\t" << tensor0.getAscendName( id ) << " = " + << tensor0.getTQueBufName( id ) << ".DeQue< " + << internal::getDataType( tensor0.getType() ) << " >();\n"; + + if( op_axes.size() == 0) { + stage << tabs << "\t\t\talp::DataMove( " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << tensor0.getAscendName( id ) << "[ " << "0" << " ], " + << pipeline.getTilingAxes() << "1" << " );\n"; + } else if( op_axes.size() == 1) { +// stage << tabs << "\t\t\tDataCopy( " << tensor0.getAscendGlobalName( id ) +// << "[ " << tensor0_offset << " ], " << tensor0.getAscendName( id ) << ", " +// << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + + stage << tabs << "\t\t\talp::DataMove( " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << tensor0.getAscendName( id ) << "[ " << "0" << " ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << " );\n"; + + } else if( op_axes.size() == 2) { +/* + stage << tabs << "\t\t\tfor( uint32_t k = 0; k < " + << igrid->problemSize( op_axes[ 0 ] ) << "; k++ ) {\n"; + + stage << tabs << "\t\t\t\tDataCopy( " << tensor0.getAscendGlobalName( id ) + << "[ " << tensor0_offset << " + k" << stride << " ], " + << tensor0.getAscendName( id ) << "[ k * " << igrid->problemSize( op_axes[ 1 ] ) << " ], " + << igrid->problemSize( op_axes[ 1 ] ) << " );\n"; + + stage << tabs << "\t\t\t}\n"; +*/ + stage << tabs << "\t\t\talp::DataMove( " + << tensor0.getAscendGlobalName( id ) << "[ " << tensor0_offset << " ], " + << tensor0.getAscendName( id ) << "[ " << "0" << " ], " + << pipeline.getTilingAxes() << igrid->problemSize( op_axes[ 0 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << ", " + << igrid->problemSize( op_axes[ 1 ] ) << ", " + << stride << " );\n"; + } + + stage << tabs << "\t\t\t" << tensor0.getTQueBufName( id ) + << ".FreeTensor( " << tensor0.getAscendName( id ) << " );\n"; + + return stage.str(); +} + +std::string alp::internal::Stage::generateImplicitFreeOp( const std::string &tabs ) const { + + //TODO I should use the arg1 + const std::string arg1 = tensor0.getAccessedElement( pipeline.getID() ); + + std::stringstream stage; + + const size_t id = pipeline.getID(); + + stage << tabs << "\t\t\t// Freeing data of a Tensor that is not output\n"; + stage << tabs << "\t\t\t" << tensor0.getTQueBufName( id ) + << ".FreeTensor( " << tensor0.getAscendName( id ) << " );\n"; + + return stage.str(); +} + +std::string alp::internal::Stage::generateToDoOp( const std::string &tabs ) const { + + return tabs + std::string(""); +} + +//TODO: perhaps rename it to computeUnionAxes + +std::vector< int > alp::internal::Stage::computeOperatorAxes() const { + + // initializing the union with the axes of tensor0 used by all operators + std::vector< int > union_axes = tensor0.getAxes(); + + switch ( enum_op_type ) { + + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + 
case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + { + const std::vector< int > &tensor1_axes = tensor1.getAxes(); + union_axes = internal::vectorUnion( union_axes, tensor1_axes ); + break; + } + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor +// default: + break; + } + + switch ( enum_op_type ) { + + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + { + const std::vector< int > &tensor2_axes = tensor2.getAxes(); + union_axes = internal::vectorUnion( union_axes, tensor2_axes ); + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor +// default: + break; + } + + // only in the case of GET_VIEW and STORE + // we need to remove the axes of the loops + // because the stored axes are those of the parent + // FIXME: perhaps we should change this design and handle views + // as different objects added to the symbol table + switch ( enum_op_type ) { + + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + { + union_axes = internal::vectorDifference( union_axes, forEachAxes ); + break; + } + // IMPLICIT_FREE is created based on STORE + // and this step is already done except that + // this function is not used by IMPLICIT_FREE + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor +// default: + break; + } + + return union_axes; +} + +void alp::internal::Stage::computeMemoryOffsets(){ + + switch ( enum_op_type ) { + + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + { + // for the GET_VIEW and STORE it's necessary to compute the expression for the stride + // we compute the stride only if the axes of the view are two + // more than two axes are not supported + // one axis does not require the stride + if( activeAxes.size() == 2 ) { + bool first = true; + for( int i = activeAxes[ 0 ] + 1; i <= activeAxes[ 1 ]; ++i ) { + if( first == true ) { + first = false; + 
stride.append( igrid->problemSize( i ) ); // n3 * n4 * n5 + } else { + stride.append( " * " + igrid->problemSize( i ) ); // n3 * n4 * n5 + } + } + } + break; + } + default: + break; + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + { + const std::vector< int > &view_parent_0_axes = alp::internal::symbols.getTensorFromView( tensor0 ).getAxes(); + + bool first = true; + + for( auto it = forEachAxes.begin(); it != forEachAxes.end(); ++it ) { + + if( std::find( view_parent_0_axes.begin(), view_parent_0_axes.end(), *it ) != view_parent_0_axes.end() ) { + + if( !first ) { + tensor0_offset.append( " + " ); + } else { + first = false; + } + + tensor0_offset.append( igrid->problemMainMode( *it ) ); // z0 + for( auto jt = view_parent_0_axes.begin(); jt != view_parent_0_axes.end(); ++jt ) { + if( *jt > *it ) { + tensor0_offset.append( " * " + igrid->problemSize( *jt ) ); // n1 * n2 * n3 + } + } + } + } + break; + } +// default: + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + { + const std::vector< int > &view_parent_1_axes = alp::internal::symbols.getTensorFromView( tensor1 ).getAxes(); + + bool first = true; + + for( auto it = forEachAxes.begin(); it != forEachAxes.end(); ++it ) { + + if( std::find( view_parent_1_axes.begin(), view_parent_1_axes.end(), *it ) != view_parent_1_axes.end() ) { + + if( !first ) { + tensor1_offset.append( " + " ); + } else { + first = false; + } + + tensor1_offset.append( igrid->problemMainMode( *it ) ); // z0 + for( auto jt = view_parent_1_axes.begin(); jt != view_parent_1_axes.end(); ++jt ) { + if( *jt > *it ) { + tensor1_offset.append( " * " + igrid->problemSize( *jt ) ); // n1 * n2 * n3 + } + } + } + } + break; + } + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor +// default: + break; + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + { + const std::vector< int > &view_parent_2_axes = alp::internal::symbols.getTensorFromView( tensor2 ).getAxes(); + + bool first = true; + + for( auto it = forEachAxes.begin(); 
it != forEachAxes.end(); ++it ) { + + if( std::find( view_parent_2_axes.begin(), view_parent_2_axes.end(), *it ) != view_parent_2_axes.end() ) { + + if( !first ) { + tensor2_offset.append( " + " ); + } else { + first = false; + } + + tensor2_offset.append( igrid->problemMainMode( *it ) ); // z0 + for( auto jt = view_parent_2_axes.begin(); jt != view_parent_2_axes.end(); ++jt ) { + if( *jt > *it ) { + tensor2_offset.append( " * " + igrid->problemSize( *jt ) ); // n1 * n2 * n3 + } + } + } + } + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor +// default: + break; + } +} + +void alp::internal::Stage::semanticsCheks(){ + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + { + if( internal::invalidAxes( tensor0.getAxes() ) == true ) { + std::cerr << "The axes of the Tensor must not be included in the axes of the forEach." << std::endl; + std::abort(); + } + break; + } + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + { + //TODO this semantics check cannot be done on the parent tensor + break; + } +// default: + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + { + if( internal::invalidAxes( tensor1.getAxes() ) == true ) { + std::cerr << "The axes of the Tensor must not be included in the axes of the forEach." 
<< std::endl; + std::abort(); + } + break; + } + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + { + //TODO this semantics check cannot be done on the parent tensor + break; + } +// default: + } + + switch ( enum_op_type ) { + +// case alp::internal::Stagetype::APPLY_MINUS: // 4 Tensors + case alp::internal::Stagetype::APPLY_MINUS: // 3 Tensors + case alp::internal::Stagetype::APPLY_ADD: // 3 Tensors + { + if( internal::invalidAxes( tensor2.getAxes() ) == true ) { + std::cerr << "The axes of the Tensor must not be included in the axes of the forEach." << std::endl; + std::abort(); + } + break; + } + case alp::internal::Stagetype::FOLDL_DIVIDE: // 2 Tensors + case alp::internal::Stagetype::SET_TENSOR: // 2 Tensors + case alp::internal::Stagetype::FOLDL_MAX: // 2 Tensors + case alp::internal::Stagetype::FOLDL_TIMES: // 2 Tensors + case alp::internal::Stagetype::FOLDL_ADD: // 2 Tensors + case alp::internal::Stagetype::FOLDL_EXP: // 1 Tensor + case alp::internal::Stagetype::SET_SCALAR: // 1 Tensor + case alp::internal::Stagetype::GET_VIEW: // 1 Tensor + case alp::internal::Stagetype::STORE: // 1 Tensor + case alp::internal::Stagetype::IMPLICIT_FREE: // 1 Tensor + { + //TODO this semantics check cannot be done on the parent tensor + break; + } +// default: + } +} diff --git a/src/graphblas/ascend/symbolTable.cpp b/src/graphblas/ascend/symbolTable.cpp new file mode 100644 index 000000000..68d5996ad --- /dev/null +++ b/src/graphblas/ascend/symbolTable.cpp @@ -0,0 +1,240 @@ +#include +#include +#include +#include + +namespace alp +{ + namespace internal + { + extern iGrid *igrid; + SymbolTable symbols; + } +} + +alp::internal::SymbolTable::SymbolTable() { + + TBuf_decl = false; + temp_scalar_id = 0; +} + +bool alp::internal::SymbolTable::existsTBufTensorDecl() const { + + return TBuf_decl; +} + +void alp::internal::SymbolTable::clearAll() { + + global_tensor_declarations.clear(); + local_tensor_declarations.clear(); + temp_tensor_declarations.clear(); + + // assuming that views are created locally in a forEach + // or is it possible to have views in a global scope? + viewToTensor.clear(); +} + +void alp::internal::SymbolTable::addGlobalTensor( const alp::Tensor &t ) { + + // TODO this semantics check is essentially unnecessary + // since global Tensors are not declared within forEach + if( internal::invalidAxes( t.getAxes() ) == true ) { + std::cerr << "The axes of the global Tensor must not be included in the axes of the forEach." << std::endl; + std::abort(); + } + + global_tensor_declarations.emplace( t.getName() , t ); + + all_global_tensors.emplace_back( t ); +} + +void alp::internal::SymbolTable::addLocalTensor( const alp::Tensor &t ) { + + if( internal::invalidAxes( t.getAxes() ) == true ) { + std::cerr << "The axes of the local Tensor must not be included in the axes of the forEach." 
<< std::endl; + std::abort(); + } + + TBuf_decl = true; + local_tensor_declarations.emplace( t.getName(), t ); + + reuseLocalTempTensorBuffer( t ); +} + +void alp::internal::SymbolTable::addTempTensor( const alp::Tensor &t ) { + + // TODO this semantics check is essentially unnecessary + // since temporary Tensors are declared internally + if( internal::invalidAxes( t.getAxes() ) == true ) { + std::cerr << "The axes of the temporary Tensor must not be included in the axes of the forEach." << std::endl; + std::abort(); + } + + TBuf_decl = true; + temp_tensor_declarations.emplace( t.getName(), t ); + + reuseLocalTempTensorBuffer( t ); +} + +void alp::internal::SymbolTable::addTensorView( const std::string &view_name, const std::string &parent_name ) { + + viewToTensor[ view_name ] = parent_name; +} + +/* +std::string alp::internal::SymbolTable::newTempScalar() { + + return "temp_scalar_" + std::to_string( temp_scalar_id++ ); +} +*/ + +void alp::internal::SymbolTable::addOutputTensor( const alp::Tensor &t ) { + + outputs_global_tensors.emplace_back( t ); +} + +void alp::internal::SymbolTable::printHostLogFile( std::stringstream &listOfGlobalTensors ) { + + bool first = true; + + for( auto it = all_global_tensors.begin(); it != all_global_tensors.end(); ++it ) { + + std::vector< int > axes = it->getAxes(); + for( auto jt = axes.begin(); jt != axes.end(); ++jt ) { + + if( first == true ) { + first = false; + } else { + listOfGlobalTensors << ","; + } + listOfGlobalTensors << *jt; + } + if ( std::find( outputs_global_tensors.cbegin(), outputs_global_tensors.cend(), *it ) == outputs_global_tensors.cend() ) { + listOfGlobalTensors << ",in"; + } else { + listOfGlobalTensors << ",out"; + } + } +} + +void alp::internal::SymbolTable::generateGlobalSymbols( std::stringstream &initFormalParam, + std::stringstream &customFormalParam, std::stringstream &allAccessedArg, + std::stringstream &allTempLocalDecl ) const { + + for( auto it = global_tensor_declarations.cbegin(); it != global_tensor_declarations.cend(); ++it ) { + if( it->first != global_tensor_declarations.cbegin()->first ) { + initFormalParam << ", "; + customFormalParam << ", "; + allAccessedArg << ", "; + } + initFormalParam << "GM_ADDR " << it->first; + // TODO data type needs to be parametrised + // TODO or MAYBE NOT? 
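+ // the init formal-parameter list receives each global tensor as a GM_ADDR handle, whereas the custom formal-parameter list below passes the same tensor as a raw uint8_t pointer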
+ customFormalParam << "uint8_t" << " * " << it->first; + allAccessedArg << it->first; + } + + for( auto it = temp_local_buffer_declarations.begin(); it != temp_local_buffer_declarations.end(); ++it ) { + allTempLocalDecl << "\t\t// Declaration of memory used for Local and Temporary tensor\n"; + allTempLocalDecl << "\t\tTBuf< QuePosition::VECCALC > " << it->first << "_temp_local_Buf;\n"; + allTempLocalDecl << "\t\tLocalTensor< " << it->first << " > " << it->first << "_temp_local;\n"; + allTempLocalDecl << "\n"; + } +} + +void alp::internal::SymbolTable::generateTempLocalInit( std::stringstream &allTempLocalInit ) const { + + for( auto it = temp_local_buffer_declarations.begin(); it != temp_local_buffer_declarations.end(); ++it ) { + allTempLocalInit << "\n"; + allTempLocalInit << "\t\t\t// Initialization of memory used for Local and Temporary tensor\n"; + allTempLocalInit << "\t\t\tpipe.InitBuffer( " << it->first + << "_temp_local_Buf, ( totWorkSpaceSize + " << it->second << " ) * sizeof( " << it->first << " ) );\n"; + allTempLocalInit << "\t\t\t" << it->first << "_temp_local = " + << it->first << "_temp_local_Buf.Get< " << it->first << " >();\n"; + } +} + +const alp::Tensor &alp::internal::SymbolTable::getTensorFromView( const alp::Tensor &tensor ) const { + + auto it = viewToTensor.find( tensor.getName() ); + // TODO: assume we have only one level of views, otherwise a loop is required + if( it != viewToTensor.cend() ) { + auto jt = global_tensor_declarations.find( it->second ); + if( jt != global_tensor_declarations.cend() ) { + return jt->second; + } else { + std::cerr << "Cannot handle a view of a non-global declaration" << std::endl; + std::abort(); + } + } else { + return tensor; + } +} + +std::string alp::internal::SymbolTable::getLocalTempTensorBuffer( Datatype type, const std::string &size ) { + + std::string datatype = internal::getDataType( type ); + + auto it = temp_local_buffer_declarations.find( datatype ); + if( it == temp_local_buffer_declarations.cend() ) { + temp_local_buffer_declarations.emplace( datatype, size ); + } else if ( size.empty() == false ) { + it->second.append( std::string( " + " ) + std::string( size ) ); + } + return datatype + "_temp_local"; +} + +void alp::internal::SymbolTable::reuseLocalTempTensorBuffer( const alp::Tensor &t ) { + + std::string datatype = internal::getDataType( t.getType() ); + const std::vector< int > &axes = t.getAxes(); + + assert( axes.size() < 3 ); + + std::string size; + if( axes.size() == 0 ) { + size = "( 32 / sizeof( "; + size.append( datatype ); + size.append( " ) )" ); + } else if( axes.size() == 1 ) { + size = igrid->problemSize( axes[ 0 ] ); + } else if( axes.size() == 2) { + size = igrid->problemSize( axes[ 0 ] ) + " * " + igrid->problemSize( axes[ 1 ] ); + } +/* + auto it = temp_local_buffer_declarations.find( datatype ); + if( it == temp_local_buffer_declarations.cend() ) { + temp_local_buffer_declarations.emplace( datatype, size ); + } else { + it->second.append( std::string( " + " ) + std::string( size ) ); + } +*/ + ( void ) getLocalTempTensorBuffer( t.getType(), size ); +} + +void alp::internal::SymbolTable::debug_print() const { + + std::cerr << "\nGLOBAL: "; + for( auto it = global_tensor_declarations.cbegin(); it != global_tensor_declarations.cend(); ++it ) { + std::cerr << it->first << "(" << alp::internal::getScope( it->second.getScope() ) << "), "; + + } + + std::cerr << "\nLOCAL: "; + for( auto it = local_tensor_declarations.cbegin(); it != local_tensor_declarations.cend(); ++it ) { + std::cerr << it->first 
<< "(" << alp::internal::getScope( it->second.getScope() ) << "), "; + } + + std::cerr << "\nTEMP: "; + for( auto it = temp_tensor_declarations.cbegin(); it != temp_tensor_declarations.cend(); ++it ) { + std::cerr << it->first << "(" << alp::internal::getScope( it->second.getScope() ) << "), "; + } + + std::cerr << "\nVIEW: "; + for( auto it = viewToTensor.cbegin(); it != viewToTensor.cend(); ++it ) { + std::cerr << it->first << "( of " << it->second << "), "; + + } + + std::cerr << std::endl << std::endl << std::endl; +} diff --git a/src/graphblas/ascend/tensor.cpp b/src/graphblas/ascend/tensor.cpp new file mode 100644 index 000000000..ca2d9c264 --- /dev/null +++ b/src/graphblas/ascend/tensor.cpp @@ -0,0 +1,183 @@ +#include +#include +//#include +#include //TODO forEachLevel +#include +#include + + +namespace alp +{ + namespace internal + { + extern SymbolTable symbols; + } +} + +size_t alp::Tensor::tensor_id = 0; + +bool alp::Tensor::operator==( const Tensor &t ) const { + return this->name == t.name; +} + +void alp::Tensor::operator=( const ReductionOperation& op ) { + foldl( *this, op.input, op.opName, op.axes ); +} + +void alp::Tensor::operator=( const ApplyOperation& op ) { + apply( *this, op.input1, op.input2, op.opName, op.axes ); +} + +alp::Tensor::Tensor( const Datatype _type, const std::vector< int > &_axes ) noexcept + : id( tensor_id++ ), + name( std::string("tensor") + std::to_string( id ) ), + type( _type ), + scope( internal::OpGen::forEachLevel > 0 ? internal::Scope::LOCAL : internal::Scope::GLOBAL ), + axes( _axes ) { + if( internal::OpGen::forEachLevel > 0 ) { + internal::symbols.addLocalTensor( *this ); + } else { + /* + for( auto it = axes.begin(); it != axes.end(); ++it ) { + if( it != axes.begin() ) { + internal::OpGen::output_host_log << ","; + } + internal::OpGen::output_host_log << *it; + } + */ + internal::symbols.addGlobalTensor( *this ); + } +} + +alp::Tensor::Tensor( const Tensor&parent, const std::vector< int > &_axes ) noexcept + : id( tensor_id++ ), + name( "view_" + std::to_string( id ) + "_of_" + parent.getName() ), + type( parent.getType() ), + scope( internal::Scope::VIEW ), + axes( _axes ) { + // TODO Is it okay to have a view with empty Axes? 
+ internal::symbols.addTensorView( name, parent.getName() ); +} + +alp::Tensor::Tensor( const Tensor &t ) noexcept + : id( t.id ), + name( t.name ), + type( t.type ), + scope( t.scope ), + axes( t.axes ) { + +} + +alp::Tensor::Tensor( const std::vector< int > &_axes, const Datatype _type ) noexcept + : id( tensor_id++ ), + name( std::string("tensor") + std::to_string( id ) ), + type( _type ), + scope( internal::Scope::TEMP ), + axes( _axes ) { + internal::symbols.addTempTensor( *this ); +} + +size_t alp::Tensor::getID() const { + return id; +} + +const std::string &alp::Tensor::getName() const { + return name; +} + +alp::Datatype alp::Tensor::getType() const { + return type; +} + +alp::internal::Scope alp::Tensor::getScope() const { + return scope; +} + +const std::vector< int > &alp::Tensor::getAxes() const { + return axes; +} + +bool alp::Tensor::isGlobalDecl() const { + + const Tensor tensor = internal::symbols.getTensorFromView( *this ); + + return tensor.scope == internal::Scope::GLOBAL; +} + +bool alp::Tensor::isLocalDecl() const { + return scope == internal::Scope::LOCAL; +} + +bool alp::Tensor::isTempDecl() const { + return scope == internal::Scope::TEMP; +} + +std::string alp::Tensor::getAccessedElement( size_t id ) const { + + // if this tensor is a view, find its parent tensor + const Tensor tensor = internal::symbols.getTensorFromView( *this ); + + // make a decision based on the scope of the parent tensor + switch( tensor.scope ) { + case internal::Scope::GLOBAL: + return "Gm_local_" + tensor.name + "_" + std::to_string( id ); + case internal::Scope::LOCAL: + return internal::getDataType( type ) + "_temp_local[ local_" + tensor.name + "_" + std::to_string( id ) + " ]"; + case internal::Scope::TEMP: + return internal::getDataType( type ) + "_temp_local[ temp_" + tensor.name + "_" + std::to_string( id ) + " ]"; + case internal::Scope::VIEW: + default: + std::cerr << "ERROR in the declaration " << name << " of getAccessedElement" << std::endl; + std::abort(); + break; + } +} + +std::string alp::Tensor::getAscendName( size_t id ) const { + + switch( scope ) { + case internal::Scope::GLOBAL: + return "Gm_local_" + name + "_" + std::to_string( id ); + case internal::Scope::LOCAL: + return "local_" + name + "_" + std::to_string( id ); + case internal::Scope::TEMP: + return "temp_" + name + "_" + std::to_string( id ); + case internal::Scope::VIEW: + default: + std::cerr << "ERROR in the symbol table, the declaration " << name << " was not found" << std::endl; + std::abort(); + } +} + +std::string alp::Tensor::getAscendGlobalName( size_t id ) const { + + switch( scope ) { + case internal::Scope::GLOBAL: + return "Gm_" + name + "_" + std::to_string( id ); + case internal::Scope::LOCAL: + case internal::Scope::TEMP: + case internal::Scope::VIEW: + default: + std::cerr << "ERROR: declaration " << name << " is not global" << std::endl; + std::abort(); + + } +} + +std::string alp::Tensor::getTQueBufName( size_t id ) const { + + switch( scope ) { + case internal::Scope::GLOBAL: + return "globalQue_" + name + "_" + std::to_string( id ); + case internal::Scope::LOCAL: + return "localBuf_" + name + "_" + std::to_string( id ); + case internal::Scope::TEMP: + return "tempBuf_" + name + "_" + std::to_string( id ); + case internal::Scope::VIEW: + default: + std::cerr << "ERROR in the declaration " << name << " of getTQueBufName" << std::endl; + std::abort(); + break; + } +} + diff --git a/src/graphblas/ascend/utils.cpp b/src/graphblas/ascend/utils.cpp new file mode 100644 index 000000000..a0cabcb4b 
--- /dev/null +++ b/src/graphblas/ascend/utils.cpp @@ -0,0 +1,114 @@ + +/* + * Copyright 2021 Huawei Technologies Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace alp { + + namespace internal { + + std::string getDataType( const Datatype dtype ) { + switch( dtype ) { + case alp::Datatype::FP16: + return "half"; + case alp::Datatype::FP32: + return "single"; + case alp::Datatype::VIEW_TYPE: + return "VIEW_TYPE"; + case alp::Datatype::NO_TYPE: + return "NO_TYPE"; + } + std::cerr << "Unknown datatype: " << (int) dtype << std::endl; + std::abort(); + } + + std::string getScope( const Scope scope ) { + switch( scope ) { + case alp::internal::Scope::GLOBAL: + return "GLOBAL"; + case alp::internal::Scope::LOCAL: + return "LOCAL"; + case alp::internal::Scope::TEMP: + return "TEMP"; + case alp::internal::Scope::VIEW: + return "VIEW"; + } + std::cerr << "Unknown scope: " << (int) scope << std::endl; + std::abort(); + } + + std::vector< int > vectorOfVectorsToVector( const std::vector< std::vector< int > > &vector_of_sets ) { + std::vector< int > vec; + for( auto it = vector_of_sets.begin(); it != vector_of_sets.end(); ++it ) { + for( auto jt = it->begin(); jt != it->end(); ++jt ) { + vec.push_back( *jt ); + } + } + return vec; + } + + std::vector< int > vectorDifference( const std::vector< int > &vector1, const std::vector< int > &vector2 ) { + std::vector< int > diff; + for( auto it = vector1.begin(); it != vector1.end(); ++it ) { + if( std::find( vector2.begin(), vector2.end(), *it ) == std::end( vector2 ) ) { + diff.push_back( *it ); + } + } + return diff; + } + + bool vectorSubset( const std::vector< int > &vector1, const std::vector< int > &vector2 ) { + for( auto it = vector1.begin(); it != vector1.end(); ++it ) { + if( std::find( vector2.begin(), vector2.end(), *it ) == std::end( vector2 ) ) { + return false; + } + } + return true; + } + + std::vector< int > vectorUnion( const std::vector< int > &vector1, const std::vector< int > &vector2 ) { + // create copies for the sorting part below + std::vector< int > v1 = vector1; + std::vector< int > v2 = vector2; + std::vector< int > vec_union; + + // the vectors must be sorted here before using set_union + // but perhaps this is not what we want + // on the other hand is unclear which order to maintain + std::sort( v1.begin(), v1.end() ); + std::sort( v2.begin(), v2.end() ); + + std::set_union( + v1.begin(), + v1.end(), + v2.begin(), + v2.end(), + std::inserter( vec_union, vec_union.end() ) + ); + + return vec_union; + } + + std::vector< int > intArgsToVector( const int arg ) { + std::vector< int > set1; + set1.push_back( arg ); + return set1; + } + } +} diff --git a/src/graphblas/nonblocking/lazy_evaluation.cpp b/src/graphblas/nonblocking/lazy_evaluation.cpp index a78de8933..bc5f23a81 100644 --- a/src/graphblas/nonblocking/lazy_evaluation.cpp +++ b/src/graphblas/nonblocking/lazy_evaluation.cpp @@ -62,15 +62,22 @@ grb::RC LazyEvaluation::addStage( const Pipeline::stage_type &&func, Opcode 
opcode, const size_t n, const size_t data_type_size, const bool dense_descr, const bool dense_mask, + // TODO FIXME is there really a need for pointers? + const size_t output_vector_id, void * const output_vector_ptr, void * const output_aux_vector_ptr, Coordinates< nonblocking > * const coor_output_ptr, Coordinates< nonblocking > * const coor_output_aux_ptr, + const size_t input_a_id, const size_t input_b_id, + const size_t input_c_id, const size_t input_d_id, + // TODO FIXME is there really a need for pointers? const void * const input_a_ptr, const void * const input_b_ptr, const void * const input_c_ptr, const void * const input_d_ptr, const Coordinates< nonblocking > * const coor_a_ptr, const Coordinates< nonblocking > * const coor_b_ptr, const Coordinates< nonblocking > * const coor_c_ptr, const Coordinates< nonblocking > * const coor_d_ptr, + const size_t input_matrix_id, + // TODO FIXME is there really a need for pointers? const void * const input_matrix ) { RC ret = SUCCESS; @@ -271,6 +278,7 @@ grb::RC LazyEvaluation::addStage( std::vector< Pipeline >::iterator pt = pipelines.begin(); pt != pipelines.end(); pt++ ) { + //std::cerr << "*** tic\n"; DBG if( ( *pt ).empty() ) { empty_pipeline = &( *pt ); @@ -279,14 +287,17 @@ grb::RC LazyEvaluation::addStage( } if( empty_pipeline != nullptr ) { - ( *empty_pipeline).addStage( + //std::cerr << "*** le 1\n"; DBG + ( *empty_pipeline ).addStage( std::move( func ), opcode, n, data_type_size, dense_descr, dense_mask, + output_vector_id, output_vector_ptr, output_aux_vector_ptr, coor_output_ptr, coor_output_aux_ptr, + input_a_id, input_b_id, input_c_id, input_d_id, input_a_ptr, input_b_ptr, input_c_ptr, input_d_ptr, coor_a_ptr, coor_b_ptr, coor_c_ptr, coor_d_ptr, - input_matrix + input_matrix_id, input_matrix ); // we always execute the pipeline when a scalar is returned @@ -296,13 +307,17 @@ grb::RC LazyEvaluation::addStage( } else { Pipeline pipeline; + //std::cerr << "*** le 2\n"; DBG pipeline.addStage( std::move( func ), opcode, n, data_type_size, dense_descr, dense_mask, + output_vector_id, output_vector_ptr, output_aux_vector_ptr, coor_output_ptr, coor_output_aux_ptr, + input_a_id, input_b_id, input_c_id, input_d_id, input_a_ptr, input_b_ptr, input_c_ptr, input_d_ptr, coor_a_ptr, coor_b_ptr, coor_c_ptr, coor_d_ptr, + input_matrix_id, input_matrix ); @@ -321,13 +336,17 @@ grb::RC LazyEvaluation::addStage( // the stage is added in the current pipeline which may be empty if it // overwrites the input of SpMV // it is not necessary to deallocate/release this pipeline + //std::cerr << "*** le 3\n"; DBG ( *ptr ).addStage( std::move( func ), opcode, n, data_type_size, dense_descr, dense_mask, + output_vector_id, output_vector_ptr, output_aux_vector_ptr, coor_output_ptr, coor_output_aux_ptr, + input_a_id, input_b_id, input_c_id, input_d_id, input_a_ptr, input_b_ptr, input_c_ptr, input_d_ptr, coor_a_ptr, coor_b_ptr, coor_c_ptr, coor_d_ptr, + input_matrix_id, input_matrix ); @@ -352,13 +371,17 @@ grb::RC LazyEvaluation::addStage( // the stage is added in the merged pipeline // it is not necessary to deallocate/release this pipeline + // std::cerr << "*** le 4\n"; DBG ( *union_pipeline ).addStage( std::move( func ), opcode, n, data_type_size, dense_descr, dense_mask, + output_vector_id, output_vector_ptr, output_aux_vector_ptr, coor_output_ptr, coor_output_aux_ptr, + input_a_id, input_b_id, input_c_id, input_d_id, input_a_ptr, input_b_ptr, input_c_ptr, input_d_ptr, coor_a_ptr, coor_b_ptr, coor_c_ptr, coor_d_ptr, + input_matrix_id, input_matrix ); 
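The hunks above thread the new container identifiers through LazyEvaluation::addStage; the pipeline.cpp hunk below records them per stage via stage_output and stage_inputs. A minimal, self-contained sketch of that bookkeeping pattern follows; the StageRegistry type and record function are hypothetical illustrations for this document only, not ALP code:

	#include <cstddef>
	#include <utility>
	#include <vector>

	// hypothetical sketch: keep, for every stage, the ID of the container it writes
	// and the IDs of the containers it reads, alongside whatever pointers are stored
	struct StageRegistry {
		std::vector< std::size_t > stage_output;                 // one output container ID per stage
		std::vector< std::vector< std::size_t > > stage_inputs;  // input container IDs per stage

		void record( const std::size_t output_id, std::vector< std::size_t > input_ids ) {
			stage_output.push_back( output_id );
			stage_inputs.push_back( std::move( input_ids ) );
		}
	};

	int main() {
		StageRegistry reg;
		reg.record( 2, { 0, 1 } );  // e.g. a stage that writes container 2 and reads containers 0 and 1
		return ( reg.stage_output.size() == 1 && reg.stage_inputs.size() == 1 ) ? 0 : 1;
	}
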
diff --git a/src/graphblas/nonblocking/pipeline.cpp b/src/graphblas/nonblocking/pipeline.cpp index 711d65f07..9b0fd82e7 100644 --- a/src/graphblas/nonblocking/pipeline.cpp +++ b/src/graphblas/nonblocking/pipeline.cpp @@ -274,15 +274,22 @@ void Pipeline::addStage( const Pipeline::stage_type &&func, const Opcode opcode, const size_t n, const size_t data_type_size, const bool dense_descr, const bool dense_mask, + const size_t output_vector_id, + // TODO FIXME is there really a need for pointers? void * const output_vector_ptr, void * const output_aux_vector_ptr, Coordinates< nonblocking > * const coor_output_ptr, Coordinates< nonblocking > * const coor_output_aux_ptr, + const size_t input_a_id, const size_t input_b_id, + const size_t input_c_id, const size_t input_d_id, + // TODO FIXME is there really a need for pointers? const void * const input_a_ptr, const void * const input_b_ptr, const void * const input_c_ptr, const void * const input_d_ptr, const Coordinates< nonblocking > * const coor_a_ptr, const Coordinates< nonblocking > * const coor_b_ptr, const Coordinates< nonblocking > * const coor_c_ptr, const Coordinates< nonblocking > * const coor_d_ptr, + const size_t input_matrix_id, + // TODO FIXME is there really a need for pointers? const void * const input_matrix ) { assert( stages.size() != 0 || containers_size == 0); @@ -305,39 +312,50 @@ void Pipeline::addStage( size_of_data_type = data_type_size; } +#ifndef NDEBUG + const size_t num_stage = stages.size(); +#endif stages.push_back( std::move( func ) ); opcodes.push_back( opcode ); + assert( opcodes.size() == num_stage + 1 ); if( output_vector_ptr != nullptr ) { output_vectors.insert( output_vector_ptr ); + stage_output.push_back( output_vector_id ); } if( output_aux_vector_ptr != nullptr ) { output_vectors.insert( output_aux_vector_ptr ); + std::cerr << "Warning: ALP/Ascend does not handle output_aux_vectors yet, please submit a bug report\n"; } // special treatment for an SpMV operation as the input must not be overwritten // by another stage of the pipeline + std::vector< size_t > inputIDs; if( opcode == Opcode::BLAS2_VXM_GENERIC ) { if( input_a_ptr != nullptr ) { input_vectors.insert( input_a_ptr ); vxm_input_vectors.insert( input_a_ptr ); + inputIDs.push_back( input_a_id ); } if( input_b_ptr != nullptr ) { input_vectors.insert( input_b_ptr ); vxm_input_vectors.insert( input_b_ptr ); + inputIDs.push_back( input_b_id ); } if( input_c_ptr != nullptr ) { input_vectors.insert( input_c_ptr ); vxm_input_vectors.insert( input_c_ptr ); + inputIDs.push_back( input_c_id ); } if( input_d_ptr != nullptr ) { input_vectors.insert( input_d_ptr ); vxm_input_vectors.insert( input_d_ptr ); + inputIDs.push_back( input_d_id ); } // in the current implementation that supports level-1 and level-2 operations @@ -346,24 +364,32 @@ void Pipeline::addStage( // moved if( input_matrix != nullptr ) { input_matrices.insert( input_matrix ); + inputIDs.push_back( input_matrix_id ); } } else { if( input_a_ptr != nullptr ) { input_vectors.insert( input_a_ptr ); + inputIDs.push_back( input_a_id ); } if( input_b_ptr != nullptr ) { input_vectors.insert( input_b_ptr ); + inputIDs.push_back( input_b_id ); } if( input_c_ptr != nullptr ) { input_vectors.insert( input_c_ptr ); + inputIDs.push_back( input_c_id ); } if( input_d_ptr != nullptr ) { input_vectors.insert( input_d_ptr ); + inputIDs.push_back( input_d_id ); } } + assert( inputIDs.size() != 0 ); + stage_inputs.push_back( inputIDs ); + assert( stage_inputs.size() == num_stage + 1 ); // update all the sets of the
pipeline by adding the entries of the new stage if( coor_a_ptr != nullptr ) { @@ -755,6 +781,8 @@ grb::RC Pipeline::verifyDenseDescriptor() { } grb::RC Pipeline::execution() { + //throw std::runtime_error( "DBG" ); + RC ret = SUCCESS; // if the pipeline is empty, nothing needs to be executed diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d90ca5cdb..1342fde7c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -125,6 +125,39 @@ if( WITH_NONBLOCKING_BACKEND ) endif( WITH_NONBLOCKING_BACKEND ) +if( WITH_ASCEND_BACKEND ) + + assert_defined_targets( backend_shmem_static ) + + ## ascend static + add_library( backend_ascend_static INTERFACE ) + target_link_libraries( backend_ascend_static INTERFACE backend_shmem_static ) + target_link_libraries( backend_ascend_static INTERFACE backend_ascend_headers ) + target_compile_definitions( backend_ascend_static INTERFACE "${ASCEND_SELECTION_DEFS}" ) + + install( TARGETS backend_ascend_static + EXPORT GraphBLASTargets + ARCHIVE DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}" + ) + + ## ascend shared + add_library( backend_ascend_shared INTERFACE ) + target_link_libraries( backend_ascend_shared INTERFACE backend_shmem_shared ) + target_link_libraries( backend_ascend_shared INTERFACE backend_ascend_headers ) + target_compile_definitions( backend_ascend_shared INTERFACE "${ASCEND_SELECTION_DEFS}" ) + + install( TARGETS backend_ascend_shared + EXPORT GraphBLASTargets + LIBRARY DESTINATION "${SHMEM_BACKEND_INSTALL_DIR}" + ) + + # this is an alias for add_grb_executables() to select the backend to link against + # DO NOT CHANGE THE ALIAS NAME! + add_library( "${ASCEND_BACKEND_DEFAULT_NAME}" ALIAS backend_ascend_static ) + +endif( WITH_ASCEND_BACKEND ) + + # library with utilities for tests, to be used optionally # i.e. 
NOT linked by default add_library( test_utils_headers INTERFACE ) diff --git a/tests/unit/asc_analytic_model.cpp b/tests/unit/asc_analytic_model.cpp new file mode 100644 index 000000000..404ff26cb --- /dev/null +++ b/tests/unit/asc_analytic_model.cpp @@ -0,0 +1,184 @@ + +#include "analytic_model.hpp" + +#include +#include + + +int main() { + { + // this is a 1D problem over 10 cores and 1M problem size, with a fictional + // cache size of 5000 bytes + asc::AnalyticModel< 1, 1, false > am( 5000, {10}, {1000000}, {true} ); + // cannot test minor tensors for 1D problems (TODO test elsewhere) + // add three float vectors + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + // suppose we just add them + am.setNumStages( 1 ); + // this problem should be feasible: + // - every processing unit gets 100000 elements per vector + // - their byte size is 400000 per vector + // - there are three vectors of size 1200000 bytes total + // - block size that maximises reuse is 5000 / 12 = 416 + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 1: suggested block size is " << bsize << ", "; + if( bsize != 416 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 416, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 1: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 10; + } + } + { + // this is a 1D problem over a 2D 2 x 5 process mesh with otherwise the same + // test parameters as the above test + asc::AnalyticModel< 2, 1, false > am( 5000, {2,5}, {1000000}, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 2: suggested block size is " << bsize << ", "; + if( bsize != 416 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 416, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 2: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 20; + } + } + { + // this is a 1D problem over a 5D 1 x 1 x 1 x 2 x 5 process mesh with + // otherwise the same test parameters as the above test + asc::AnalyticModel< 5, 1, false > am( 5000, {1,1,1,2,5}, {1000000}, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.addGlobalTensor( 4, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 3: suggested block size is " << bsize << ", "; + if( bsize != 416 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 416, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 3: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 30; + } + } + { + // test a 1D case where a trivial solution is possible + asc::AnalyticModel< 1, 1, false > am( 24000, {10}, {10000}, {true} ); + am.addGlobalTensor( 8, {true} ); + am.addGlobalTensor( 8, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << 
"Test case 4: suggested block size is " << bsize << ", "; + if( bsize != 1000 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 10000, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 4: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 40; + } + } + { + // test a 1D case where a trivial solution is possible + asc::AnalyticModel< 1, 1, false > am( 3003, {1}, {1001}, {true} ); + am.addGlobalTensor( 3, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 5: suggested block size is " << bsize << ", "; + if( bsize != 1001 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 1001, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 5: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 50; + } + } + { + // test for the other trivial (worst-case trivial) solution, 1D + asc::AnalyticModel< 1, 1, false > am( 32, {8}, {2538791}, {true} ); + am.addGlobalTensor( 8, {true} ); + am.addGlobalTensor( 8, {true} ); + am.addGlobalTensor( 8, {true} ); + am.addGlobalTensor( 8, {true} ); + am.setNumStages( 2 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 6: suggested block size is " << bsize << ", "; + if( bsize != 1 ) { + std::cout << "x\n"; + std::ostringstream oss; + oss << "Expected block size 1, got " << bsize << " instead"; + throw std::runtime_error( oss.str() ); + } else { + std::cout << "v\n"; + } + } catch( const std::exception &e ) { + std::cerr << "Error during test case 6: " << e.what() << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 60; + } + } + { + // test with no feasible solution, 1D + asc::AnalyticModel< 1, 1, false > am( 1, {8}, {2538791}, {true} ); + am.addGlobalTensor( 1, {true} ); + am.addGlobalTensor( 1, {true} ); + am.setNumStages( 1 ); + try { + const size_t bsize = am.getBlockSize( 0 ); + std::cout << "Test case 7: suggested block size is " << bsize << ", x\n"; + std::cerr << "Error during test case 7: a blocksize was returned even " + << "though the problem is infeasible" << std::endl; + std::cout << "Test FAILED\n" << std::endl; + return 70; + } catch( ... ) { + std::cout << "Test case 7: infeasibility correctly detected\n"; + } + } + + // done + std::cout << "Test OK\n" << std::endl; + return 0; +} +