diff --git a/.gitmodules b/.gitmodules index f19f40a5..c074c99d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,7 @@ [submodule "3rdparty/oneDNN"] path = 3rdparty/oneDNN url = https://github.com/uxlfoundation/oneDNN +[submodule "3rdparty/kokkos"] + path = 3rdparty/kokkos + url = https://github.com/kokkos/kokkos.git + branch = release-candidate-5.0.1 diff --git a/3rdparty/kokkos b/3rdparty/kokkos new file mode 160000 index 00000000..f5723022 --- /dev/null +++ b/3rdparty/kokkos @@ -0,0 +1 @@ +Subproject commit f572302292068a22672ab0012adaeeee0596fcea diff --git a/CMakeLists.txt b/CMakeLists.txt index 67d60660..6cb6861d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ if(ENABLE_STATISTIC_WEIGHTS) add_definitions(-DENABLE_STATISTIC_WEIGHTS) endif() -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) enable_testing() @@ -43,6 +43,28 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") add_subdirectory(3rdparty) include(cmake/opencv_config.cmake) +include(cmake/kokkos_config.cmake) + +include_directories("${KOKKOS_INSTALL_DIR}/include") + +add_library(Kokkos_imported INTERFACE) +add_dependencies(Kokkos_imported kokkos_external) + +target_include_directories(Kokkos_imported INTERFACE + "${KOKKOS_INSTALL_DIR}/include" +) + +target_link_directories(Kokkos_imported INTERFACE + "${KOKKOS_INSTALL_DIR}/lib" +) + + +target_link_libraries(Kokkos_imported INTERFACE kokkoscore kokkoscontainers) + + +if(MSVC) + add_compile_options(/wd4267 /wd4244 /wd4127 /wd4324) +endif() if (NOT WIN32) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror") diff --git a/app/Accuracy/CMakeLists.txt b/app/Accuracy/CMakeLists.txt index b4010c21..ae3deee3 100644 --- a/app/Accuracy/CMakeLists.txt +++ b/app/Accuracy/CMakeLists.txt @@ -10,6 +10,7 @@ target_link_libraries( ACCLib ${OpenCV_LIBS} ) target_link_libraries( ACCLib TBB_unified) target_link_libraries( ACCLib layers_lib) target_link_libraries( ACCLib gtest_main) +target_link_libraries( ACCLib 
Kokkos_imported) add_executable(Accuracy_Check accuracy_check.cpp) target_link_libraries(Accuracy_Check ACCLib) diff --git a/app/Graph/build.cpp b/app/Graph/build.cpp index 9a4d1cd0..c4ee7fe9 100644 --- a/app/Graph/build.cpp +++ b/app/Graph/build.cpp @@ -194,21 +194,23 @@ void build_graph(it_lab_ai::Graph& graph, it_lab_ai::Tensor& input, } try { - std::sort( - connection_list.begin(), connection_list.end(), - [&](const auto& a, const auto& b) { - if (!name_to_layer.count(a.first) || !name_to_layer.count(b.first)) { - return false; - } - return name_to_layer[a.first]->getID() < - name_to_layer[b.first]->getID(); - });
+    // Known layers sort by ID and unknown names sort after them, so the
+    // comparator is a strict weak ordering as std::sort requires.
+    std::sort(connection_list.begin(), connection_list.end(),
+              [&](const auto& a, const auto& b) {
+                const auto lhs = name_to_layer.find(a.first);
+                const auto rhs = name_to_layer.find(b.first);
+                if (lhs == name_to_layer.end()) return false;
+                if (rhs == name_to_layer.end()) return true;
+                return lhs->second->getID() < rhs->second->getID();
+              });
 } catch (const std::exception& e) { std::cerr << "ERROR during sorting: " << e.what() << '\n'; } for (const auto& [source_name, target_name] : connection_list) { - if (name_to_layer.count(source_name) && name_to_layer.count(target_name)) { + if (name_to_layer.contains(source_name) && + name_to_layer.contains(target_name)) { if (target_name.find("Concat") != std::string::npos || name_to_layer[target_name]->getName() == it_lab_ai::kConcat) { if (concat_connections.find(target_name) != concat_connections.end()) { @@ -532,7 +534,7 @@ ParseResult parse_json_model(RuntimeOptions options, std::string constant_name = inputs[1].get(); constant_name = get_base_layer_name(constant_name); - if (layer_parameters.count(constant_name)) { + if (layer_parameters.contains(constant_name)) { splits = layer_parameters[constant_name]; } else if (constant_name.find("onnx::") != std::string::npos) { splits = last_constant_value; @@ -735,7 +737,7 @@ ParseResult parse_json_model(RuntimeOptions options, std::string constant_name = inputs[1].get(); constant_name = get_base_layer_name(constant_name);
- if (layer_parameters.count(constant_name)) { + if (layer_parameters.contains(constant_name)) { shape = layer_parameters[constant_name]; } } @@ -797,7 +799,7 @@ ParseResult parse_json_model(RuntimeOptions options, std::string constant_name = inputs[1].get(); constant_name = get_base_layer_name(constant_name); - if (layer_parameters.count(constant_name)) { + if (layer_parameters.contains(constant_name)) { axes = layer_parameters[constant_name]; } else if (constant_name.find("onnx::") != std::string::npos) { axes = last_constant_value; diff --git a/cmake/kokkos_config.cmake b/cmake/kokkos_config.cmake new file mode 100644 index 00000000..fecbe7e6 --- /dev/null +++ b/cmake/kokkos_config.cmake @@ -0,0 +1,63 @@
+include(ExternalProject)
+include(ProcessorCount)
+
+set(KOKKOS_BUILD_DIR "${CMAKE_BINARY_DIR}/3rdparty/kokkos_build")
+set(KOKKOS_INSTALL_DIR "${CMAKE_BINARY_DIR}/3rdparty/kokkos_install")
+
+# Parallel job count for the nested Kokkos build. Honour NPROC when the
+# including project defines it; otherwise detect the core count. An empty
+# NPROC would otherwise expand to a bare "-j" with no job limit given.
+if(DEFINED NPROC AND NOT "${NPROC}" STREQUAL "")
+  set(KOKKOS_BUILD_JOBS "${NPROC}")
+else()
+  ProcessorCount(KOKKOS_BUILD_JOBS)
+  if(KOKKOS_BUILD_JOBS EQUAL 0)
+    set(KOKKOS_BUILD_JOBS 1)
+  endif()
+endif()
+
+# Only pass --config when there is a configuration to name: an empty
+# CMAKE_BUILD_TYPE would otherwise leave --config without a value.
+get_property(KOKKOS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(KOKKOS_MULTI_CONFIG)
+  set(KOKKOS_CONFIG_ARGS --config $<CONFIG>)
+elseif(CMAKE_BUILD_TYPE)
+  set(KOKKOS_CONFIG_ARGS --config ${CMAKE_BUILD_TYPE})
+else()
+  set(KOKKOS_CONFIG_ARGS "")
+endif()
+
+ExternalProject_Add(
+  kokkos_external
+  SOURCE_DIR "${CMAKE_SOURCE_DIR}/3rdparty/kokkos"
+  BINARY_DIR "${KOKKOS_BUILD_DIR}"
+  INSTALL_DIR "${KOKKOS_INSTALL_DIR}"
+
+  CMAKE_ARGS
+    -G "${CMAKE_GENERATOR}"
+    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+    -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_DIR}
+
+    -DKokkos_ENABLE_SERIAL=ON
+    -DKokkos_ARCH_NATIVE=OFF
+    -DKokkos_ENABLE_OPENMP=OFF
+    -DKokkos_ENABLE_THREADS=ON
+    -DKokkos_ENABLE_CUDA=OFF
+    -DKokkos_ENABLE_HIP=OFF
+    -DKokkos_ENABLE_TESTS=OFF
+    -DKokkos_ENABLE_EXAMPLES=OFF
+
+    -DKokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON
+    -DKokkos_ENABLE_LIBDL=OFF
+
+  BUILD_COMMAND ${CMAKE_COMMAND} --build "${KOKKOS_BUILD_DIR}" ${KOKKOS_CONFIG_ARGS} --parallel ${KOKKOS_BUILD_JOBS}
+
+  INSTALL_COMMAND ${CMAKE_COMMAND} --install "${KOKKOS_BUILD_DIR}" ${KOKKOS_CONFIG_ARGS}
+
+  BUILD_ALWAYS OFF
+  LOG_CONFIGURE ON
+  LOG_BUILD ON
+  LOG_INSTALL ON
+)
+
+set(Kokkos_DIR "${KOKKOS_INSTALL_DIR}/lib/cmake/Kokkos" CACHE PATH "Path to Kokkos CMake config")
diff --git a/include/parallel/backends.hpp b/include/parallel/backends.hpp index b63d128c..694fdc32 100644 ---
a/include/parallel/backends.hpp +++ b/include/parallel/backends.hpp @@ -3,6 +3,8 @@ #include #include +// NOLINTNEXTLINE(misc-header-include-cycle) +#include #include #include #include @@ -17,7 +19,8 @@ enum class Backend : std::uint8_t { kSeq = 0, kThreads = 1, kTbb = 2, - kOmp = 3 + kOmp = 3, + kKokkos = 4 }; struct Options { @@ -116,5 +119,43 @@ inline void impl_omp(std::size_t count, } #endif
+/// Runs func(i) for every i in [0, count) through Kokkos::parallel_for and
+/// blocks until all iterations have completed.
+///
+/// Kokkos is initialised lazily on the first call unless the host
+/// application has already done so. Consequently opt.max_threads only has
+/// an effect on the call that performs the initialisation.
+inline void impl_kokkos(std::size_t count,
+                        const std::function<void(std::size_t)>& func,
+                        const Options& opt) {
+  if (count == 0) return;
+  static std::once_flag init_flag;
+  std::call_once(init_flag, [&opt]() {
+    // A second Kokkos::initialize() is an error, so leave an already
+    // running runtime (and its shutdown) to whoever started it.
+    if (Kokkos::is_initialized()) return;
+
+    int num_threads = static_cast<int>(opt.max_threads);
+    if (num_threads <= 0) {
+      num_threads = static_cast<int>(std::thread::hardware_concurrency());
+    }
+
+    Kokkos::InitializationSettings args;
+    // hardware_concurrency() may report 0; let Kokkos choose in that case.
+    if (num_threads > 0) args.set_num_threads(num_threads);
+    Kokkos::initialize(args);
+
+    std::atexit([]() {
+      if (Kokkos::is_initialized() && !Kokkos::is_finalized()) {
+        Kokkos::finalize();
+      }
+    });
+  });
+
+  auto kokkos_func = [&func](const std::size_t i) { func(i); };
+  Kokkos::parallel_for("parallel_for", count, kokkos_func);
+  Kokkos::fence();
+}
+
 } // namespace parallel } // namespace it_lab_ai diff --git a/include/parallel/parallel.hpp b/include/parallel/parallel.hpp index 5232dcae..5834aeba 100644 --- a/include/parallel/parallel.hpp +++ b/include/parallel/parallel.hpp @@ -29,7 +29,8 @@ inline Backend select_backend(const Options& opt, std::size_t n) { } if (opt.backend == Backend::kSeq || opt.backend == Backend::kThreads || - opt.backend == Backend::kTbb || opt.backend == Backend::kOmp) { + opt.backend == Backend::kTbb || opt.backend == Backend::kOmp || + opt.backend == Backend::kKokkos) { return opt.backend; } @@ -56,6 +57,9 @@ inline void parallel_for(std::size_t count, Func&& func, case Backend::kOmp: impl_omp(count, std::forward(func), opt); break; + case Backend::kKokkos: + impl_kokkos(count, std::forward(func), opt); + break; } } diff --git a/src/graph/CMakeLists.txt b/src/graph/CMakeLists.txt index 9054e8a2..b84b830d 100644 --- a/src/graph/CMakeLists.txt +++ b/src/graph/CMakeLists.txt @@ -1,3
+1,4 @@ file(GLOB_RECURSE graph_src *.cpp) add_library(graph_lib STATIC "${GRAPH_HEADERS}" "${graph_src}") target_link_libraries(graph_lib PUBLIC TBB_unified) +add_dependencies(graph_lib kokkos_external) diff --git a/src/graph_transformations/CMakeLists.txt b/src/graph_transformations/CMakeLists.txt index 6942b48f..08050170 100644 --- a/src/graph_transformations/CMakeLists.txt +++ b/src/graph_transformations/CMakeLists.txt @@ -1,3 +1,4 @@ file(GLOB_RECURSE graphT_src *.cpp) add_library(graphT_lib STATIC "${GRAPHT_HEADERS}" "${graphT_src}") target_link_libraries(graphT_lib PUBLIC TBB_unified) +add_dependencies(graphT_lib kokkos_external) diff --git a/src/layers/CMakeLists.txt b/src/layers/CMakeLists.txt index e441507f..35382404 100644 --- a/src/layers/CMakeLists.txt +++ b/src/layers/CMakeLists.txt @@ -4,3 +4,4 @@ add_library(layers_lib STATIC "${LAYERS_HEADERS}" "${layers_src}") target_link_libraries(layers_lib PUBLIC TBB_unified) target_link_libraries(layers_lib PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(layers_lib PUBLIC dnnl) +target_link_libraries(layers_lib PUBLIC Kokkos_imported) diff --git a/src/layers_oneDNN/CMakeLists.txt b/src/layers_oneDNN/CMakeLists.txt index e4ee067e..23763d82 100644 --- a/src/layers_oneDNN/CMakeLists.txt +++ b/src/layers_oneDNN/CMakeLists.txt @@ -1,6 +1,7 @@ file(GLOB_RECURSE layers_oneDNN_src *.cpp) add_library(layers_oneDNN_lib STATIC "${LAYERS_ONEDNN_HEADERS}" "${layers_oneDNN_src}") target_link_libraries(layers_oneDNN_lib PUBLIC dnnl TBB_unified) +add_dependencies(layers_oneDNN_lib kokkos_external) target_include_directories(layers_oneDNN_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include ) diff --git a/test/single_layer/test_ewlayer.cpp b/test/single_layer/test_ewlayer.cpp index 0174fbf9..e925a310 100644 --- a/test/single_layer/test_ewlayer.cpp +++ b/test/single_layer/test_ewlayer.cpp @@ -216,181 +216,3 @@ TEST(ewlayer, new_ewlayer_can_sigmoid_float_extreme_values) { EXPECT_NEAR((*out[0].as())[i], expected_output[i], 
1e-5F); } } - -TEST(ewlayer, parallel_for_ew) { - EWLayer layer("relu"); - - std::vector vec(8000000, -1); - Tensor input = make_tensor(vec); - Tensor output; - std::vector in{input}; - std::vector out{output}; - - std::vector backends = {ParBackend::kSeq, ParBackend::kThreads, - ParBackend::kTbb, ParBackend::kOmp}; - - for (auto backend : backends) { - RuntimeOptions options; - options.setParallelBackend(backend); - - auto start = std::chrono::high_resolution_clock::now(); - layer.run(in, out, options); - auto end = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(end - start); - std::cout << " time: " << duration.count() << " ms" << std::endl; - for (size_t i = 0; i < 8000000; i++) { - EXPECT_EQ((*out[0].as())[i], 0); - } - } -} - -TEST(ewlayer, parallel_for_ew_sigmoid_compact) { - EWLayer layer("sigmoid"); - - std::vector vec(8000000, -1); - Tensor input = make_tensor(vec); - Tensor output; - std::vector in{input}; - std::vector out{output}; - - std::vector> backends = { - {ParBackend::kSeq, "Sequential"}, - {ParBackend::kThreads, "Threads"}, - {ParBackend::kTbb, "TBB"}, - {ParBackend::kOmp, "OpenMP"}}; - - std::vector reference_result; - bool first = true; - - for (const auto& [backend, name] : backends) { - RuntimeOptions options; - options.parallel = (backend != ParBackend::kSeq); - options.par_backend = backend; - if (backend == ParBackend::kThreads) { - options.threads = 4; - } - - auto start = std::chrono::high_resolution_clock::now(); - layer.run(in, out, options); - auto end = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(end - start); - - std::cout << "Sigmoid " << name << " time: " << duration.count() << " ms" - << std::endl; - - auto current_result = *out[0].as(); - if (first) { - reference_result = current_result; - first = false; - for (size_t i = 0; i < 100; i++) { - EXPECT_EQ(current_result[i], 0) - << "Invalid sigmoid result at index " << i; - } - } else { - for 
(size_t i = 0; i < reference_result.size(); i++) { - EXPECT_EQ(current_result[i], reference_result[i]) - << "Mismatch with " << name << " at index " << i; - } - } - } -} - -TEST(ewlayer, parallel_for_direct) { - const int SIZE = 2000; - std::vector matrix1(SIZE * SIZE); - std::vector matrix2(SIZE * SIZE); - std::vector result(SIZE * SIZE); - - for (int i = 0; i < SIZE * SIZE; ++i) { - matrix1[i] = 1; - matrix2[i] = 1; - } - - auto start = std::chrono::high_resolution_clock::now(); - parallel::parallel_for( - SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, - ParBackend::kSeq); - - auto end = std::chrono::high_resolution_clock::now(); - auto total_duration = - std::chrono::duration_cast(end - start); - - for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); - - start = std::chrono::high_resolution_clock::now(); - parallel::parallel_for( - SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, - ParBackend::kThreads); - end = std::chrono::high_resolution_clock::now(); - total_duration = - std::chrono::duration_cast(end - start); - for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); - - start = std::chrono::high_resolution_clock::now(); - parallel::parallel_for( - SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, - ParBackend::kTbb); - end = std::chrono::high_resolution_clock::now(); - total_duration = - std::chrono::duration_cast(end - start); - for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); - - start = std::chrono::high_resolution_clock::now(); - parallel::parallel_for( - SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, - ParBackend::kOmp); - end = std::chrono::high_resolution_clock::now(); - total_duration = - std::chrono::duration_cast(end - start); - for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); -} - -TEST(ewlayer, parallel_for_notmatrix) { - const int SIZE = 3000; - std::vector matrix1(SIZE * SIZE); - std::vector 
result(SIZE * SIZE); - - for (int i = 0; i < SIZE * SIZE; ++i) { - matrix1[i] = 1; - } - - auto start = std::chrono::high_resolution_clock::now(); - parallel::parallel_for( - SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, - ParBackend::kSeq); - - auto end = std::chrono::high_resolution_clock::now(); - auto total_duration = - std::chrono::duration_cast(end - start); - - for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); - - start = std::chrono::high_resolution_clock::now(); - parallel::parallel_for( - SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, - ParBackend::kThreads); - end = std::chrono::high_resolution_clock::now(); - total_duration = - std::chrono::duration_cast(end - start); - for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); - - start = std::chrono::high_resolution_clock::now(); - parallel::parallel_for( - SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, - ParBackend::kTbb); - end = std::chrono::high_resolution_clock::now(); - total_duration = - std::chrono::duration_cast(end - start); - for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); - - start = std::chrono::high_resolution_clock::now(); - parallel::parallel_for( - SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, - ParBackend::kOmp); - end = std::chrono::high_resolution_clock::now(); - total_duration = - std::chrono::duration_cast(end - start); - for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); -} diff --git a/test/single_layer_parall_version/test_ewlayer_parall.cpp b/test/single_layer_parall_version/test_ewlayer_parall.cpp new file mode 100644 index 00000000..deb4660a --- /dev/null +++ b/test/single_layer_parall_version/test_ewlayer_parall.cpp @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "layers/EWLayer.hpp"
+
+using namespace it_lab_ai;
+
+namespace {
+
+// Set to false to silence the per-backend timing lines.
+constexpr bool kEnableTimingOutput = true;
+
+// Element count of the tensors fed to the layer tests.
+constexpr std::size_t kTensorSize = 8000000;
+
+// Every backend the tests ask parallel_for to dispatch to.
+constexpr ParBackend kAllBackends[] = {
+    ParBackend::kSeq, ParBackend::kThreads, ParBackend::kTbb,
+    ParBackend::kOmp, ParBackend::kKokkos};
+
+// Runs work() once and, when timing output is enabled, prints how long it
+// took in milliseconds.
+template <typename Work>
+void run_timed(Work&& work) {
+  const auto start = std::chrono::high_resolution_clock::now();
+  work();
+  const auto end = std::chrono::high_resolution_clock::now();
+  if constexpr (kEnableTimingOutput) {
+    const auto duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    std::cout << " time: " << duration.count() << " ms\n";
+  }
+}
+
+// Runs `layer` on kTensorSize copies of -1 once per backend and checks that
+// every output element equals `expected`.
+void expect_layer_output_on_all_backends(EWLayer& layer, int expected) {
+  std::vector<int> values(kTensorSize, -1);
+  Tensor input = make_tensor(values);
+  std::vector<Tensor> in{input};
+
+  for (const auto backend : kAllBackends) {
+    // Start from an empty output each time so that a backend which writes
+    // nothing cannot pass on the previous backend's result.
+    Tensor output;
+    std::vector<Tensor> out{output};
+
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    run_timed([&] { layer.run(in, out, options); });
+
+    // Count mismatches instead of asserting per element: a broken backend
+    // would otherwise emit millions of failure messages.
+    const auto& produced = *out[0].as<int>();
+    std::size_t mismatches = 0;
+    for (std::size_t i = 0; i < kTensorSize; ++i) {
+      if (produced[i] != expected) ++mismatches;
+    }
+    EXPECT_EQ(mismatches, 0U) << "backend #" << static_cast<int>(backend);
+  }
+}
+
+// Runs `body` over every index of `result` through parallel::parallel_for,
+// once per backend, and checks that each element ends up equal to
+// `expected`.
+template <typename Body>
+void expect_parallel_for_on_all_backends(std::vector<int>& result,
+                                         int expected, Body body) {
+  for (const auto backend : kAllBackends) {
+    // Wipe the previous backend's values; otherwise a backend that does no
+    // work would still see the expected values and pass.
+    result.assign(result.size(), 0);
+
+    run_timed([&] { parallel::parallel_for(result.size(), body, backend); });
+
+    for (std::size_t i = 0; i < result.size(); ++i) {
+      ASSERT_EQ(result[i], expected)
+          << "backend #" << static_cast<int>(backend) << ", index " << i;
+    }
+  }
+}
+
+}  // namespace
+
+TEST(ewlayer_parall, parallel_for_ew_relu) {
+  EWLayer layer("relu");
+  expect_layer_output_on_all_backends(layer, 0);
+}
+
+TEST(ewlayer_parall, parallel_for_sigmoid) {
+  EWLayer layer("sigmoid");
+  expect_layer_output_on_all_backends(layer, 0);
+}
+
+TEST(ewlayer_parall, parallel_for_minus) {
+  EWLayer layer("minus");
+  expect_layer_output_on_all_backends(layer, 1);
+}
+
+TEST(ewlayer_parall, parallel_for_linear) {
+  EWLayer layer("linear", 2.0F, 2.0F);
+  expect_layer_output_on_all_backends(layer, 0);
+}
+
+TEST(ewlayer_parall, parallel_for_direct) {
+  constexpr std::size_t kSide = 2000;
+  const std::vector<int> matrix1(kSide * kSide, 1);
+  const std::vector<int> matrix2(kSide * kSide, 1);
+  std::vector<int> result(kSide * kSide);
+
+  expect_parallel_for_on_all_backends(
+      result, 2,
+      [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; });
+}
+
+TEST(ewlayer_parall, parallel_for_notmatrix) {
+  constexpr std::size_t kSide = 3000;
+  const std::vector<int> matrix1(kSide * kSide, 1);
+  std::vector<int> result(kSide * kSide);
+
+  expect_parallel_for_on_all_backends(
+      result, 2, [&](std::size_t i) { result[i] = matrix1[i] + 1; });
+}