diff --git a/.gitmodules b/.gitmodules
index f19f40a5..c074c99d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,3 +13,7 @@
 [submodule "3rdparty/oneDNN"]
 	path = 3rdparty/oneDNN
 	url = https://github.com/uxlfoundation/oneDNN
+[submodule "3rdparty/kokkos"]
+	path = 3rdparty/kokkos
+	url = https://github.com/kokkos/kokkos.git
+	branch = release-candidate-5.0.1
diff --git a/3rdparty/kokkos b/3rdparty/kokkos
new file mode 160000
index 00000000..f5723022
--- /dev/null
+++ b/3rdparty/kokkos
@@ -0,0 +1 @@
+Subproject commit f572302292068a22672ab0012adaeeee0596fcea
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 67d60660..6cb6861d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ if(ENABLE_STATISTIC_WEIGHTS)
     add_definitions(-DENABLE_STATISTIC_WEIGHTS)
 endif()
 
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 
 enable_testing()
 
@@ -43,6 +43,40 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 add_subdirectory(3rdparty)
 
 include(cmake/opencv_config.cmake)
+include(cmake/kokkos_config.cmake)
+
+# Directory-wide include path for the installed Kokkos headers. Targets that
+# only add_dependencies() on kokkos_external (without linking Kokkos_imported)
+# presumably rely on it to find the headers.
+include_directories("${KOKKOS_INSTALL_DIR}/include")
+
+# Interface target carrying the usage requirements of the Kokkos build that
+# kokkos_external installs into KOKKOS_INSTALL_DIR.
+add_library(Kokkos_imported INTERFACE)
+add_dependencies(Kokkos_imported kokkos_external)
+
+target_include_directories(Kokkos_imported INTERFACE
+    "${KOKKOS_INSTALL_DIR}/include"
+)
+
+target_link_directories(Kokkos_imported INTERFACE
+    "${KOKKOS_INSTALL_DIR}/lib"
+)
+
+# Static libraries are resolved left to right, so the dependent library
+# (kokkoscontainers) is listed before kokkoscore. Kokkos is configured with the
+# Threads backend, hence the explicit dependency on the thread library.
+find_package(Threads REQUIRED)
+target_link_libraries(Kokkos_imported INTERFACE
+    kokkoscontainers
+    kokkoscore
+    Threads::Threads
+)
+
+# Disable MSVC warnings C4267, C4244, C4127 and C4324 for all targets.
+if(MSVC)
+    add_compile_options(/wd4267 /wd4244 /wd4127 /wd4324)
+endif()
 
 if (NOT WIN32)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror")
diff --git a/app/Accuracy/CMakeLists.txt b/app/Accuracy/CMakeLists.txt
index b4010c21..ae3deee3 100644
--- a/app/Accuracy/CMakeLists.txt
+++ b/app/Accuracy/CMakeLists.txt
@@ -10,6 +10,7 @@ target_link_libraries( ACCLib ${OpenCV_LIBS} )
 target_link_libraries( ACCLib TBB_unified)
 target_link_libraries( ACCLib layers_lib)
 target_link_libraries( ACCLib gtest_main)
+target_link_libraries( ACCLib Kokkos_imported)
 
 add_executable(Accuracy_Check accuracy_check.cpp)
 target_link_libraries(Accuracy_Check ACCLib)
diff --git a/app/Graph/build.cpp b/app/Graph/build.cpp
index 9a4d1cd0..c4ee7fe9 100644
--- a/app/Graph/build.cpp
+++ b/app/Graph/build.cpp
@@ -194,21 +194,23 @@ void build_graph(it_lab_ai::Graph& graph, it_lab_ai::Tensor& input,
   }
 
   try {
-    std::sort(
-        connection_list.begin(), connection_list.end(),
-        [&](const auto& a, const auto& b) {
-          if (!name_to_layer.count(a.first) || !name_to_layer.count(b.first)) {
-            return false;
-          }
-          return name_to_layer[a.first]->getID() <
-                 name_to_layer[b.first]->getID();
-        });
+    std::sort(connection_list.begin(), connection_list.end(),
+              [&](const auto& a, const auto& b) {
+                if (!name_to_layer.contains(a.first) ||
+                    !name_to_layer.contains(b.first)) {
+                  return false;
+                }
+                return name_to_layer[a.first]->getID() <
+                       name_to_layer[b.first]->getID();
+              });
+
   } catch (const std::exception& e) {
     std::cerr << "ERROR during sorting: " << e.what() << '\n';
   }
 
   for (const auto& [source_name, target_name] : connection_list) {
-    if (name_to_layer.count(source_name) && name_to_layer.count(target_name)) {
+    if (name_to_layer.contains(source_name) &&
+        name_to_layer.contains(target_name)) {
       if (target_name.find("Concat") != std::string::npos ||
           name_to_layer[target_name]->getName() == it_lab_ai::kConcat) {
         if (concat_connections.find(target_name) != concat_connections.end()) {
@@ -532,7 +534,7 @@ ParseResult parse_json_model(RuntimeOptions options,
             std::string constant_name = inputs[1].get<std::string>();
             constant_name = get_base_layer_name(constant_name);
 
-            if (layer_parameters.count(constant_name)) {
+            if (layer_parameters.contains(constant_name)) {
               splits = layer_parameters[constant_name];
             } else if (constant_name.find("onnx::") != std::string::npos) {
               splits = last_constant_value;
@@ -735,7 +737,7 @@ ParseResult parse_json_model(RuntimeOptions options,
             std::string constant_name = inputs[1].get<std::string>();
             constant_name = get_base_layer_name(constant_name);
 
-            if (layer_parameters.count(constant_name)) {
+            if (layer_parameters.contains(constant_name)) {
               shape = layer_parameters[constant_name];
             }
           }
@@ -797,7 +799,7 @@ ParseResult parse_json_model(RuntimeOptions options,
             std::string constant_name = inputs[1].get<std::string>();
             constant_name = get_base_layer_name(constant_name);
 
-            if (layer_parameters.count(constant_name)) {
+            if (layer_parameters.contains(constant_name)) {
               axes = layer_parameters[constant_name];
             } else if (constant_name.find("onnx::") != std::string::npos) {
               axes = last_constant_value;
diff --git a/cmake/kokkos_config.cmake b/cmake/kokkos_config.cmake
new file mode 100644
index 00000000..fecbe7e6
--- /dev/null
+++ b/cmake/kokkos_config.cmake
@@ -0,0 +1,50 @@
+# Builds the bundled Kokkos submodule (3rdparty/kokkos) as an external project
+# and installs it into the build tree.
+#
+# Provides:
+#   kokkos_external    - target that configures, builds and installs Kokkos
+#   KOKKOS_BUILD_DIR   - binary directory of the external build
+#   KOKKOS_INSTALL_DIR - install prefix; libraries are placed in <prefix>/lib
+#   Kokkos_DIR (cache) - expected location of the installed CMake package
+include(ExternalProject)
+
+set(KOKKOS_BUILD_DIR "${CMAKE_BINARY_DIR}/3rdparty/kokkos_build")
+set(KOKKOS_INSTALL_DIR "${CMAKE_BINARY_DIR}/3rdparty/kokkos_install")
+
+ExternalProject_Add(
+    kokkos_external
+    SOURCE_DIR "${CMAKE_SOURCE_DIR}/3rdparty/kokkos"
+    BINARY_DIR "${KOKKOS_BUILD_DIR}"
+    INSTALL_DIR "${KOKKOS_INSTALL_DIR}"
+
+    # The generator is inherited from the parent build by ExternalProject, so it
+    # is not passed again through CMAKE_ARGS.
+    CMAKE_ARGS
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+        -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_DIR}
+        # Keep libraries in <prefix>/lib even where GNUInstallDirs prefers lib64.
+        -DCMAKE_INSTALL_LIBDIR=lib
+
+        -DKokkos_ENABLE_SERIAL=ON
+        -DKokkos_ARCH_NATIVE=OFF
+        -DKokkos_ENABLE_OPENMP=OFF
+        -DKokkos_ENABLE_THREADS=ON
+        -DKokkos_ENABLE_CUDA=OFF
+        -DKokkos_ENABLE_HIP=OFF
+        -DKokkos_ENABLE_TESTS=OFF
+        -DKokkos_ENABLE_EXAMPLES=OFF
+
+        -DKokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON
+        -DKokkos_ENABLE_LIBDL=OFF
+
+    # No BUILD_COMMAND/INSTALL_COMMAND: the ExternalProject defaults select the
+    # active configuration themselves, which also works when CMAKE_BUILD_TYPE is
+    # empty (multi-config generators or an unset build type).
+
+    BUILD_ALWAYS OFF
+    LOG_CONFIGURE ON
+    LOG_BUILD ON
+    LOG_INSTALL ON
+)
+
+set(Kokkos_DIR "${KOKKOS_INSTALL_DIR}/lib/cmake/Kokkos" CACHE PATH "Path to Kokkos CMake config")
diff --git a/include/parallel/backends.hpp b/include/parallel/backends.hpp
index b63d128c..694fdc32 100644
--- a/include/parallel/backends.hpp
+++ b/include/parallel/backends.hpp
@@ -3,6 +3,8 @@
 #include <oneapi/tbb/info.h>
 #include <oneapi/tbb/parallel_for.h>
 
+// NOLINTNEXTLINE(misc-header-include-cycle)
+#include <Kokkos_Core.hpp>
 #include <cstddef>
 #include <cstdint>
 #include <functional>
@@ -17,7 +19,8 @@ enum class Backend : std::uint8_t {
   kSeq = 0,
   kThreads = 1,
   kTbb = 2,
-  kOmp = 3
+  kOmp = 3,
+  kKokkos = 4
 };
 
 struct Options {
@@ -116,5 +119,49 @@ inline void impl_omp(std::size_t count,
 }
 #endif
 
+// Runs func(i) for every i in [0, count) through Kokkos::parallel_for and
+// returns only after Kokkos::fence() has been called on the submitted work.
+//
+// The Kokkos runtime is initialized on the first call with count > 0, unless
+// it is already initialized, in which case it is used as is. The thread count
+// comes from opt.max_threads of that first call when positive, otherwise from
+// std::thread::hardware_concurrency(); later calls do not reconfigure it. A
+// runtime initialized here is finalized by a std::atexit handler.
+inline void impl_kokkos(std::size_t count,
+                        const std::function<void(std::size_t)>& func,
+                        const Options& opt) {
+  if (count == 0) return;
+
+  static std::once_flag init_flag;
+  std::call_once(init_flag, [&opt]() {
+    // Kokkos may only be initialized once per process; leave a runtime that
+    // was set up elsewhere untouched (and do not finalize it at exit either).
+    if (Kokkos::is_initialized()) return;
+
+    int num_threads = opt.max_threads;
+    if (num_threads <= 0) {
+      num_threads = static_cast<int>(std::thread::hardware_concurrency());
+    }
+
+    Kokkos::InitializationSettings args;
+    // hardware_concurrency() may return 0 when the value is unknown; in that
+    // case keep the Kokkos default instead of requesting zero threads.
+    if (num_threads > 0) args.set_num_threads(num_threads);
+    Kokkos::initialize(args);
+
+    std::atexit([]() {
+      if (Kokkos::is_initialized() && !Kokkos::is_finalized()) {
+        Kokkos::finalize();
+      }
+    });
+  });
+
+  // func is captured by reference; the fence keeps this call from returning
+  // before every iteration has run.
+  auto kokkos_func = [&func](const std::size_t i) { func(i); };
+  Kokkos::parallel_for("parallel_for", count, kokkos_func);
+  Kokkos::fence();
+}
+
 }  // namespace parallel
 }  // namespace it_lab_ai
diff --git a/include/parallel/parallel.hpp b/include/parallel/parallel.hpp
index 5232dcae..5834aeba 100644
--- a/include/parallel/parallel.hpp
+++ b/include/parallel/parallel.hpp
@@ -29,7 +29,8 @@ inline Backend select_backend(const Options& opt, std::size_t n) {
   }
 
   if (opt.backend == Backend::kSeq || opt.backend == Backend::kThreads ||
-      opt.backend == Backend::kTbb || opt.backend == Backend::kOmp) {
+      opt.backend == Backend::kTbb || opt.backend == Backend::kOmp ||
+      opt.backend == Backend::kKokkos) {
     return opt.backend;
   }
 
@@ -56,6 +57,9 @@ inline void parallel_for(std::size_t count, Func&& func,
     case Backend::kOmp:
       impl_omp(count, std::forward<Func>(func), opt);
       break;
+    case Backend::kKokkos:
+      impl_kokkos(count, std::forward<Func>(func), opt);
+      break;
   }
 }
 
diff --git a/src/graph/CMakeLists.txt b/src/graph/CMakeLists.txt
index 9054e8a2..b84b830d 100644
--- a/src/graph/CMakeLists.txt
+++ b/src/graph/CMakeLists.txt
@@ -1,3 +1,4 @@
 file(GLOB_RECURSE graph_src *.cpp)
 add_library(graph_lib STATIC "${GRAPH_HEADERS}" "${graph_src}")
 target_link_libraries(graph_lib PUBLIC TBB_unified)
+add_dependencies(graph_lib kokkos_external)
diff --git a/src/graph_transformations/CMakeLists.txt b/src/graph_transformations/CMakeLists.txt
index 6942b48f..08050170 100644
--- a/src/graph_transformations/CMakeLists.txt
+++ b/src/graph_transformations/CMakeLists.txt
@@ -1,3 +1,4 @@
 file(GLOB_RECURSE graphT_src *.cpp)
 add_library(graphT_lib STATIC "${GRAPHT_HEADERS}" "${graphT_src}")
 target_link_libraries(graphT_lib PUBLIC TBB_unified)
+add_dependencies(graphT_lib kokkos_external)
diff --git a/src/layers/CMakeLists.txt b/src/layers/CMakeLists.txt
index e441507f..35382404 100644
--- a/src/layers/CMakeLists.txt
+++ b/src/layers/CMakeLists.txt
@@ -4,3 +4,4 @@ add_library(layers_lib STATIC "${LAYERS_HEADERS}" "${layers_src}")
 target_link_libraries(layers_lib PUBLIC TBB_unified)
 target_link_libraries(layers_lib PUBLIC OpenMP::OpenMP_CXX)
 target_link_libraries(layers_lib PUBLIC dnnl)
+target_link_libraries(layers_lib PUBLIC Kokkos_imported)
diff --git a/src/layers_oneDNN/CMakeLists.txt b/src/layers_oneDNN/CMakeLists.txt
index e4ee067e..23763d82 100644
--- a/src/layers_oneDNN/CMakeLists.txt
+++ b/src/layers_oneDNN/CMakeLists.txt
@@ -1,6 +1,7 @@
 file(GLOB_RECURSE layers_oneDNN_src *.cpp)
 add_library(layers_oneDNN_lib STATIC "${LAYERS_ONEDNN_HEADERS}" "${layers_oneDNN_src}")
 target_link_libraries(layers_oneDNN_lib PUBLIC dnnl TBB_unified)
+add_dependencies(layers_oneDNN_lib kokkos_external)
 target_include_directories(layers_oneDNN_lib PUBLIC
     ${CMAKE_CURRENT_SOURCE_DIR}/../../include
 )
diff --git a/test/single_layer/test_ewlayer.cpp b/test/single_layer/test_ewlayer.cpp
index 0174fbf9..e925a310 100644
--- a/test/single_layer/test_ewlayer.cpp
+++ b/test/single_layer/test_ewlayer.cpp
@@ -216,181 +216,3 @@ TEST(ewlayer, new_ewlayer_can_sigmoid_float_extreme_values) {
     EXPECT_NEAR((*out[0].as<float>())[i], expected_output[i], 1e-5F);
   }
 }
-
-TEST(ewlayer, parallel_for_ew) {
-  EWLayer layer("relu");
-
-  std::vector<int> vec(8000000, -1);
-  Tensor input = make_tensor<int>(vec);
-  Tensor output;
-  std::vector<Tensor> in{input};
-  std::vector<Tensor> out{output};
-
-  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
-                                      ParBackend::kTbb, ParBackend::kOmp};
-
-  for (auto backend : backends) {
-    RuntimeOptions options;
-    options.setParallelBackend(backend);
-
-    auto start = std::chrono::high_resolution_clock::now();
-    layer.run(in, out, options);
-    auto end = std::chrono::high_resolution_clock::now();
-    auto duration =
-        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-    std::cout << " time: " << duration.count() << " ms" << std::endl;
-    for (size_t i = 0; i < 8000000; i++) {
-      EXPECT_EQ((*out[0].as<int>())[i], 0);
-    }
-  }
-}
-
-TEST(ewlayer, parallel_for_ew_sigmoid_compact) {
-  EWLayer layer("sigmoid");
-
-  std::vector<int> vec(8000000, -1);
-  Tensor input = make_tensor<int>(vec);
-  Tensor output;
-  std::vector<Tensor> in{input};
-  std::vector<Tensor> out{output};
-
-  std::vector<std::pair<ParBackend, std::string>> backends = {
-      {ParBackend::kSeq, "Sequential"},
-      {ParBackend::kThreads, "Threads"},
-      {ParBackend::kTbb, "TBB"},
-      {ParBackend::kOmp, "OpenMP"}};
-
-  std::vector<int> reference_result;
-  bool first = true;
-
-  for (const auto& [backend, name] : backends) {
-    RuntimeOptions options;
-    options.parallel = (backend != ParBackend::kSeq);
-    options.par_backend = backend;
-    if (backend == ParBackend::kThreads) {
-      options.threads = 4;
-    }
-
-    auto start = std::chrono::high_resolution_clock::now();
-    layer.run(in, out, options);
-    auto end = std::chrono::high_resolution_clock::now();
-    auto duration =
-        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-
-    std::cout << "Sigmoid " << name << " time: " << duration.count() << " ms"
-              << std::endl;
-
-    auto current_result = *out[0].as<int>();
-    if (first) {
-      reference_result = current_result;
-      first = false;
-      for (size_t i = 0; i < 100; i++) {
-        EXPECT_EQ(current_result[i], 0)
-            << "Invalid sigmoid result at index " << i;
-      }
-    } else {
-      for (size_t i = 0; i < reference_result.size(); i++) {
-        EXPECT_EQ(current_result[i], reference_result[i])
-            << "Mismatch with " << name << " at index " << i;
-      }
-    }
-  }
-}
-
-TEST(ewlayer, parallel_for_direct) {
-  const int SIZE = 2000;
-  std::vector<int> matrix1(SIZE * SIZE);
-  std::vector<int> matrix2(SIZE * SIZE);
-  std::vector<int> result(SIZE * SIZE);
-
-  for (int i = 0; i < SIZE * SIZE; ++i) {
-    matrix1[i] = 1;
-    matrix2[i] = 1;
-  }
-
-  auto start = std::chrono::high_resolution_clock::now();
-  parallel::parallel_for(
-      SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; },
-      ParBackend::kSeq);
-
-  auto end = std::chrono::high_resolution_clock::now();
-  auto total_duration =
-      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-
-  for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2);
-
-  start = std::chrono::high_resolution_clock::now();
-  parallel::parallel_for(
-      SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; },
-      ParBackend::kThreads);
-  end = std::chrono::high_resolution_clock::now();
-  total_duration =
-      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-  for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2);
-
-  start = std::chrono::high_resolution_clock::now();
-  parallel::parallel_for(
-      SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; },
-      ParBackend::kTbb);
-  end = std::chrono::high_resolution_clock::now();
-  total_duration =
-      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-  for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2);
-
-  start = std::chrono::high_resolution_clock::now();
-  parallel::parallel_for(
-      SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; },
-      ParBackend::kOmp);
-  end = std::chrono::high_resolution_clock::now();
-  total_duration =
-      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-  for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2);
-}
-
-TEST(ewlayer, parallel_for_notmatrix) {
-  const int SIZE = 3000;
-  std::vector<int> matrix1(SIZE * SIZE);
-  std::vector<int> result(SIZE * SIZE);
-
-  for (int i = 0; i < SIZE * SIZE; ++i) {
-    matrix1[i] = 1;
-  }
-
-  auto start = std::chrono::high_resolution_clock::now();
-  parallel::parallel_for(
-      SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; },
-      ParBackend::kSeq);
-
-  auto end = std::chrono::high_resolution_clock::now();
-  auto total_duration =
-      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-
-  for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2);
-
-  start = std::chrono::high_resolution_clock::now();
-  parallel::parallel_for(
-      SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; },
-      ParBackend::kThreads);
-  end = std::chrono::high_resolution_clock::now();
-  total_duration =
-      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-  for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2);
-
-  start = std::chrono::high_resolution_clock::now();
-  parallel::parallel_for(
-      SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; },
-      ParBackend::kTbb);
-  end = std::chrono::high_resolution_clock::now();
-  total_duration =
-      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-  for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2);
-
-  start = std::chrono::high_resolution_clock::now();
-  parallel::parallel_for(
-      SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; },
-      ParBackend::kOmp);
-  end = std::chrono::high_resolution_clock::now();
-  total_duration =
-      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-  for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2);
-}
diff --git a/test/single_layer_parall_version/test_ewlayer_parall.cpp b/test/single_layer_parall_version/test_ewlayer_parall.cpp
new file mode 100644
index 00000000..deb4660a
--- /dev/null
+++ b/test/single_layer_parall_version/test_ewlayer_parall.cpp
@@ -0,0 +1,127 @@
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "layers/EWLayer.hpp"
+
+// Set to 0 to silence the timing lines printed by these tests.
+#define ENABLE_TIMING_OUTPUT 1
+
+#if ENABLE_TIMING_OUTPUT
+#define PRINT_TIMING(msg) std::cout << msg << std::endl
+#else
+#define PRINT_TIMING(msg) ((void)0)
+#endif
+
+using namespace it_lab_ai;
+
+namespace {
+
+// Backends exercised by every test in this file, in execution order.
+constexpr std::array<ParBackend, 5> kAllBackends = {
+    ParBackend::kSeq, ParBackend::kThreads, ParBackend::kTbb, ParBackend::kOmp,
+    ParBackend::kKokkos};
+
+// Number of elements fed to the element-wise layer tests.
+constexpr std::size_t kEwElementCount = 8000000;
+
+// Prints the time elapsed since `start` (when timing output is enabled).
+void print_elapsed(std::chrono::high_resolution_clock::time_point start) {
+  const auto end = std::chrono::high_resolution_clock::now();
+  const auto duration =
+      std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+  PRINT_TIMING(" time: " << duration.count() << " ms");
+}
+
+// Runs `layer` on kEwElementCount copies of -1 with every backend and expects
+// each output element to equal `expected`.
+void expect_ew_output_on_all_backends(EWLayer& layer, int expected) {
+  std::vector<int> vec(kEwElementCount, -1);
+  Tensor input = make_tensor<int>(vec);
+  std::vector<Tensor> in{input};
+
+  for (const ParBackend backend : kAllBackends) {
+    RuntimeOptions options;
+    options.setParallelBackend(backend);
+
+    // Start from an empty output for each backend so that a result left over
+    // from the previous backend cannot satisfy the expectations below.
+    Tensor output;
+    std::vector<Tensor> out{output};
+
+    const auto start = std::chrono::high_resolution_clock::now();
+    layer.run(in, out, options);
+    print_elapsed(start);
+
+    for (std::size_t i = 0; i < kEwElementCount; i++) {
+      EXPECT_EQ((*out[0].as<int>())[i], expected);
+    }
+  }
+}
+
+}  // namespace
+
+TEST(ewlayer_parall, parallel_for_ew_relu) {
+  EWLayer layer("relu");
+  expect_ew_output_on_all_backends(layer, 0);
+}
+
+TEST(ewlayer_parall, parallel_for_sigmoid) {
+  EWLayer layer("sigmoid");
+  expect_ew_output_on_all_backends(layer, 0);
+}
+
+TEST(ewlayer_parall, parallel_for_minus) {
+  EWLayer layer("minus");
+  expect_ew_output_on_all_backends(layer, 1);
+}
+
+TEST(ewlayer_parall, parallel_for_linear) {
+  EWLayer layer("linear", 2.0F, 2.0F);
+  expect_ew_output_on_all_backends(layer, 0);
+}
+
+TEST(ewlayer_parall, parallel_for_direct) {
+  constexpr std::size_t kCount = 2000 * 2000;
+  std::vector<int> matrix1(kCount, 1);
+  std::vector<int> matrix2(kCount, 1);
+  std::vector<int> result(kCount);
+
+  for (const ParBackend backend : kAllBackends) {
+    // Clear the previous backend's output; otherwise a backend that never
+    // ran the body would still pass the checks below.
+    std::fill(result.begin(), result.end(), 0);
+
+    const auto start = std::chrono::high_resolution_clock::now();
+    parallel::parallel_for(
+        kCount, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; },
+        backend);
+    print_elapsed(start);
+
+    for (std::size_t i = 0; i < kCount; i++) ASSERT_EQ(result[i], 2);
+  }
+}
+
+TEST(ewlayer_parall, parallel_for_notmatrix) {
+  constexpr std::size_t kCount = 3000 * 3000;
+  std::vector<int> matrix1(kCount, 1);
+  std::vector<int> result(kCount);
+
+  for (const ParBackend backend : kAllBackends) {
+    // Clear the previous backend's output; otherwise a backend that never
+    // ran the body would still pass the checks below.
+    std::fill(result.begin(), result.end(), 0);
+
+    const auto start = std::chrono::high_resolution_clock::now();
+    parallel::parallel_for(
+        kCount, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, backend);
+    print_elapsed(start);
+
+    for (std::size_t i = 0; i < kCount; i++) ASSERT_EQ(result[i], 2);
+  }
+}