Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@
[submodule "3rdparty/oneDNN"]
path = 3rdparty/oneDNN
url = https://github.com/uxlfoundation/oneDNN
[submodule "3rdparty/kokkos"]
path = 3rdparty/kokkos
url = https://github.com/kokkos/kokkos.git
branch = release-candidate-5.0.1
1 change: 1 addition & 0 deletions 3rdparty/kokkos
Submodule kokkos added at f57230
26 changes: 25 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ if(ENABLE_STATISTIC_WEIGHTS)
add_definitions(-DENABLE_STATISTIC_WEIGHTS)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)

enable_testing()

Expand All @@ -43,6 +43,30 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
add_subdirectory(3rdparty)

include(cmake/opencv_config.cmake)
# Kokkos integration.
# cmake/kokkos_config.cmake declares the `kokkos_external` ExternalProject and
# sets KOKKOS_INSTALL_DIR, the prefix the Kokkos build is installed into.
include(cmake/kokkos_config.cmake)

# Directory-wide include path for the installed Kokkos headers; applies to
# every target created below this point.
include_directories("${KOKKOS_INSTALL_DIR}/include")

# Kokkos_imported: interface target that carries the Kokkos usage requirements
# (include path, library search path, libraries). The dependency on
# kokkos_external orders the external build before targets that link this one.
add_library(Kokkos_imported INTERFACE)
add_dependencies(Kokkos_imported kokkos_external)
target_include_directories(Kokkos_imported INTERFACE "${KOKKOS_INSTALL_DIR}/include")
target_link_directories(Kokkos_imported INTERFACE "${KOKKOS_INSTALL_DIR}/lib")

# The libraries are linked by name and resolved through the search path above;
# the names differ between Windows and the other platforms.
if(WIN32)
  target_link_libraries(Kokkos_imported INTERFACE kokkoscore kokkoscontainers)
else()
  target_link_libraries(Kokkos_imported INTERFACE kokkos)
endif()

# MSVC only: disable warnings C4267 and C4244 (narrowing conversions),
# C4127 (constant conditional expression) and C4324 (structure padded due to
# an alignment specifier) for all targets in this directory and below.
# NOTE(review): presumably these are triggered by the Kokkos headers - confirm.
if(MSVC)
  add_compile_options(/wd4267 /wd4244 /wd4127 /wd4324)
endif()

if (NOT WIN32)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror")
Expand Down
14 changes: 7 additions & 7 deletions app/Graph/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,8 @@ void build_graph(it_lab_ai::Graph& graph, it_lab_ai::Tensor& input,
try {
std::sort(connection_list.begin(), connection_list.end(),
[&](const auto& a, const auto& b) {
if (!name_to_layer_ptr.count(a.first) ||
!name_to_layer_ptr.count(b.first)) {
if (!name_to_layer_ptr.contains(a.first) ||
!name_to_layer_ptr.contains(b.first)) {
return false;
}
return name_to_layer_ptr[a.first]->getID() <
Expand All @@ -239,8 +239,8 @@ void build_graph(it_lab_ai::Graph& graph, it_lab_ai::Tensor& input,
}

for (const auto& [source_name, target_name] : connection_list) {
if (name_to_layer_ptr.count(source_name) &&
name_to_layer_ptr.count(target_name)) {
if (name_to_layer_ptr.contains(source_name) &&
name_to_layer_ptr.contains(target_name)) {
if (target_name.find("Concat") != std::string::npos ||
name_to_layer_ptr[target_name]->getName() == it_lab_ai::kConcat) {
if (concat_connections.find(target_name) != concat_connections.end()) {
Expand Down Expand Up @@ -573,7 +573,7 @@ ParseResult parse_json_model(RuntimeOptions options,
std::string constant_name = inputs[1].get<std::string>();
constant_name = get_base_layer_name(constant_name);

if (layer_parameters.count(constant_name)) {
if (layer_parameters.contains(constant_name)) {
splits = layer_parameters[constant_name];
} else if (constant_name.find("onnx::") != std::string::npos) {
splits = last_constant_value;
Expand Down Expand Up @@ -771,7 +771,7 @@ ParseResult parse_json_model(RuntimeOptions options,
std::string constant_name = inputs[1].get<std::string>();
constant_name = get_base_layer_name(constant_name);

if (layer_parameters.count(constant_name)) {
if (layer_parameters.contains(constant_name)) {
shape = layer_parameters[constant_name];
}
}
Expand Down Expand Up @@ -833,7 +833,7 @@ ParseResult parse_json_model(RuntimeOptions options,
std::string constant_name = inputs[1].get<std::string>();
constant_name = get_base_layer_name(constant_name);

if (layer_parameters.count(constant_name)) {
if (layer_parameters.contains(constant_name)) {
axes = layer_parameters[constant_name];
} else if (constant_name.find("onnx::") != std::string::npos) {
axes = last_constant_value;
Expand Down
39 changes: 39 additions & 0 deletions cmake/kokkos_config.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Builds the bundled Kokkos sources (3rdparty/kokkos) as an external project
# and installs them into the build tree.
#
# Defines:
#   KOKKOS_BUILD_DIR    - build directory of the external Kokkos project
#   KOKKOS_INSTALL_DIR  - install prefix of the external Kokkos project
#   Kokkos_DIR (cache)  - directory of the installed Kokkos CMake package
#   kokkos_external     - the ExternalProject target
#
# Optional input:
#   NPROC - number of parallel build jobs; detected automatically when it is
#           not set to a positive number.
include(ExternalProject)
include(ProcessorCount)

set(KOKKOS_BUILD_DIR "${CMAKE_BINARY_DIR}/3rdparty/kokkos_build")
set(KOKKOS_INSTALL_DIR "${CMAKE_BINARY_DIR}/3rdparty/kokkos_install")

# Fail at configure time with an actionable message instead of at build time
# inside a log file when the submodule has not been checked out.
if(NOT EXISTS "${CMAKE_SOURCE_DIR}/3rdparty/kokkos/CMakeLists.txt")
  message(FATAL_ERROR
    "Kokkos sources not found in ${CMAKE_SOURCE_DIR}/3rdparty/kokkos. "
    "Run: git submodule update --init --recursive")
endif()

# Number of parallel build jobs. An unset NPROC would otherwise expand to a
# bare "-j", which means "unlimited jobs" for Makefile generators.
if(DEFINED NPROC AND NPROC GREATER 0)
  set(kokkos_build_jobs "${NPROC}")
else()
  ProcessorCount(kokkos_build_jobs)
  if(kokkos_build_jobs EQUAL 0)
    set(kokkos_build_jobs 1)
  endif()
endif()

# Configuration selection for the build/install steps. CMAKE_BUILD_TYPE is
# empty under multi-config generators (Visual Studio, Xcode, Ninja
# Multi-Config), so the active configuration is taken from $<CONFIG> there.
# With a single-config generator and no build type, "--config" is omitted so
# it cannot swallow the following argument.
get_property(kokkos_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(kokkos_is_multi_config)
  set(kokkos_config_args --config $<CONFIG>)
elseif(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "")
  set(kokkos_config_args --config "${CMAKE_BUILD_TYPE}")
else()
  set(kokkos_config_args "")
endif()

ExternalProject_Add(
  kokkos_external
  SOURCE_DIR "${CMAKE_SOURCE_DIR}/3rdparty/kokkos"
  BINARY_DIR "${KOKKOS_BUILD_DIR}"
  INSTALL_DIR "${KOKKOS_INSTALL_DIR}"

  # The generator is not passed explicitly: ExternalProject configures the
  # sub-build with the generator of the main build by default.
  CMAKE_ARGS
    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
    -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_DIR}
    # Pin the library directory: consumers use "${KOKKOS_INSTALL_DIR}/lib",
    # while the platform default may be "lib64".
    -DCMAKE_INSTALL_LIBDIR=lib

    -DKokkos_ENABLE_SERIAL=ON
    -DKokkos_ARCH_NATIVE=OFF
    -DKokkos_ENABLE_OPENMP=OFF
    -DKokkos_ENABLE_THREADS=ON
    -DKokkos_ENABLE_CUDA=OFF
    -DKokkos_ENABLE_HIP=OFF
    -DKokkos_ENABLE_TESTS=OFF
    -DKokkos_ENABLE_EXAMPLES=OFF

    -DKokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON
    -DKokkos_ENABLE_LIBDL=OFF

  BUILD_COMMAND
    "${CMAKE_COMMAND}" --build "${KOKKOS_BUILD_DIR}"
    ${kokkos_config_args} --parallel ${kokkos_build_jobs}

  INSTALL_COMMAND
    "${CMAKE_COMMAND}" --install "${KOKKOS_BUILD_DIR}" ${kokkos_config_args}

  BUILD_ALWAYS OFF
  LOG_CONFIGURE ON
  LOG_BUILD ON
  LOG_INSTALL ON
)

set(Kokkos_DIR "${KOKKOS_INSTALL_DIR}/lib/cmake/Kokkos" CACHE PATH "Path to Kokkos CMake config")
28 changes: 27 additions & 1 deletion include/parallel/backends.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <oneapi/tbb/info.h>
#include <oneapi/tbb/parallel_for.h>

// NOLINTNEXTLINE(misc-header-include-cycle)
#include <Kokkos_Core.hpp>
#include <cstddef>
#include <cstdint>
#include <functional>
Expand All @@ -17,7 +19,8 @@ enum class Backend : std::uint8_t {
kSeq = 0,
kThreads = 1,
kTbb = 2,
kOmp = 3
kOmp = 3,
kKokkos = 4
};

struct Options {
Expand Down Expand Up @@ -116,5 +119,28 @@ inline void impl_omp(std::size_t count,
}
#endif

// Runs func(i) for every i in [0, count) through Kokkos::parallel_for and
// returns only after Kokkos::fence() has completed.
//
// Kokkos is initialized lazily, at most once per process, on the first call
// with count > 0. Consequently opt.max_threads is only consulted on that first
// call; later calls reuse the already running Kokkos runtime. If Kokkos was
// already initialized by someone else, it is used as is and this function
// neither re-initializes nor finalizes it.
//
// Parameters:
//   count - number of iterations; 0 is a no-op and does not initialize Kokkos.
//   func  - callable invoked with each index; it must stay valid for the whole
//           call because it is captured by reference.
//   opt   - options; only max_threads is read (values <= 0 select
//           std::thread::hardware_concurrency()).
inline void impl_kokkos(std::size_t count,
                        const std::function<void(std::size_t)>& func,
                        const Options& opt) {
  if (count == 0) return;

  static std::once_flag init_flag;
  std::call_once(init_flag, [&opt]() {
    // Initializing Kokkos twice is an error, so respect a runtime that some
    // other component has already brought up (and leave its shutdown to it).
    if (Kokkos::is_initialized()) return;

    int num_threads =
        opt.max_threads > 0
            ? static_cast<int>(opt.max_threads)
            : static_cast<int>(std::thread::hardware_concurrency());
    // hardware_concurrency() is allowed to return 0 when the value cannot be
    // determined; never request fewer than one thread.
    if (num_threads < 1) num_threads = 1;

    Kokkos::InitializationSettings args;
    args.set_num_threads(num_threads);
    Kokkos::initialize(args);

    // Shut the runtime down at process exit, unless it has already been
    // finalized explicitly in the meantime.
    std::atexit([]() {
      if (Kokkos::is_initialized() && !Kokkos::is_finalized()) {
        Kokkos::finalize();
      }
    });
  });

  // Thin wrapper so Kokkos copies a small lambda instead of the std::function.
  auto kokkos_func = [&func](const std::size_t i) { func(i); };
  Kokkos::parallel_for("parallel_for", count, kokkos_func);
  // Block until every iteration has finished before returning to the caller.
  Kokkos::fence();
}

} // namespace parallel
} // namespace it_lab_ai
6 changes: 5 additions & 1 deletion include/parallel/parallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ inline Backend select_backend(const Options& opt, std::size_t n) {
}

if (opt.backend == Backend::kSeq || opt.backend == Backend::kThreads ||
opt.backend == Backend::kTbb || opt.backend == Backend::kOmp) {
opt.backend == Backend::kTbb || opt.backend == Backend::kOmp ||
opt.backend == Backend::kKokkos) {
return opt.backend;
}

Expand All @@ -56,6 +57,9 @@ inline void parallel_for(std::size_t count, Func&& func,
case Backend::kOmp:
impl_omp(count, std::forward<Func>(func), opt);
break;
case Backend::kKokkos:
impl_kokkos(count, std::forward<Func>(func), opt);
break;
}
}

Expand Down
1 change: 1 addition & 0 deletions src/layers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ add_library(layers_lib STATIC "${LAYERS_HEADERS}" "${layers_src}")
# Public link dependencies of layers_lib, in link order. PUBLIC makes them
# part of the usage requirements of every target that links layers_lib.
target_link_libraries(layers_lib
  PUBLIC
    TBB_unified
    OpenMP::OpenMP_CXX
    dnnl
    Kokkos_imported
)
178 changes: 0 additions & 178 deletions test/single_layer/test_ewlayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,181 +216,3 @@ TEST(ewlayer, new_ewlayer_can_sigmoid_float_extreme_values) {
EXPECT_NEAR((*out[0].as<float>())[i], expected_output[i], 1e-5F);
}
}

// Runs the "relu" element-wise layer on 8,000,000 copies of -1 once per
// parallel backend, prints the elapsed time of each run and expects every
// output element to be 0.
TEST(ewlayer, parallel_for_ew) {
  EWLayer layer("relu");

  const size_t element_count = 8000000;
  std::vector<int> vec(element_count, -1);
  Tensor input = make_tensor<int>(vec);
  std::vector<Tensor> in{input};

  std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads,
                                      ParBackend::kTbb, ParBackend::kOmp};

  for (auto backend : backends) {
    RuntimeOptions options;
    options.setParallelBackend(backend);

    // Use a fresh output tensor for every backend: reusing the previous one
    // would let a backend that writes nothing pass on stale results.
    Tensor output;
    std::vector<Tensor> out{output};

    auto start = std::chrono::high_resolution_clock::now();
    layer.run(in, out, options);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << " time: " << duration.count() << " ms" << std::endl;

    // Check the size first so the element loop cannot read out of bounds.
    auto values = out[0].as<int>();
    ASSERT_EQ(values->size(), element_count);
    for (size_t i = 0; i < element_count; i++) {
      EXPECT_EQ((*values)[i], 0);
    }
  }
}

// Runs the "sigmoid" element-wise layer on 8,000,000 copies of -1 once per
// parallel backend and prints the elapsed time of each run. The first backend
// in the list produces the reference result (its first 100 elements must be
// 0); every other backend must reproduce that reference element by element.
TEST(ewlayer, parallel_for_ew_sigmoid_compact) {
  EWLayer layer("sigmoid");

  std::vector<int> vec(8000000, -1);
  Tensor input = make_tensor<int>(vec);
  std::vector<Tensor> in{input};

  std::vector<std::pair<ParBackend, std::string>> backends = {
      {ParBackend::kSeq, "Sequential"},
      {ParBackend::kThreads, "Threads"},
      {ParBackend::kTbb, "TBB"},
      {ParBackend::kOmp, "OpenMP"}};

  std::vector<int> reference_result;
  bool first = true;

  for (const auto& [backend, name] : backends) {
    RuntimeOptions options;
    options.parallel = (backend != ParBackend::kSeq);
    options.par_backend = backend;
    if (backend == ParBackend::kThreads) {
      options.threads = 4;
    }

    // Use a fresh output tensor for every backend: reusing the previous one
    // would let a backend that writes nothing pass on stale results.
    Tensor output;
    std::vector<Tensor> out{output};

    auto start = std::chrono::high_resolution_clock::now();
    layer.run(in, out, options);
    auto end = std::chrono::high_resolution_clock::now();
    auto duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

    std::cout << "Sigmoid " << name << " time: " << duration.count() << " ms"
              << std::endl;

    if (first) {
      // Copy the output once; it becomes the reference for later backends.
      reference_result = *out[0].as<int>();
      first = false;
      ASSERT_GE(reference_result.size(), 100U);
      for (size_t i = 0; i < 100; i++) {
        EXPECT_EQ(reference_result[i], 0)
            << "Invalid sigmoid result at index " << i;
      }
    } else {
      // Compare in place (no copy); check the size first so the indexed
      // comparison cannot read out of bounds.
      auto current_result = out[0].as<int>();
      ASSERT_EQ(current_result->size(), reference_result.size())
          << "Size mismatch with " << name;
      for (size_t i = 0; i < reference_result.size(); i++) {
        EXPECT_EQ((*current_result)[i], reference_result[i])
            << "Mismatch with " << name << " at index " << i;
      }
    }
  }
}

// Adds two SIZE*SIZE arrays of ones element-wise through
// parallel::parallel_for, once per backend, and expects every result
// element to be 2.
TEST(ewlayer, parallel_for_direct) {
  const int SIZE = 2000;
  const std::vector<int> matrix1(SIZE * SIZE, 1);
  const std::vector<int> matrix2(SIZE * SIZE, 1);
  std::vector<int> result(SIZE * SIZE);

  const std::vector<ParBackend> backends = {
      ParBackend::kSeq, ParBackend::kThreads, ParBackend::kTbb,
      ParBackend::kOmp};

  for (auto backend : backends) {
    // Clear the results of the previous backend; otherwise a backend that
    // computes nothing would still pass on the values left behind.
    result.assign(result.size(), 0);

    parallel::parallel_for(
        SIZE * SIZE,
        [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, backend);

    for (int i = 0; i < SIZE * SIZE; i++) {
      ASSERT_EQ(result[i], 2) << "backend " << static_cast<int>(backend)
                              << ", index " << i;
    }
  }
}

// Adds 1 to every element of a SIZE*SIZE array of ones through
// parallel::parallel_for, once per backend, and expects every result
// element to be 2.
TEST(ewlayer, parallel_for_notmatrix) {
  const int SIZE = 3000;
  const std::vector<int> matrix1(SIZE * SIZE, 1);
  std::vector<int> result(SIZE * SIZE);

  const std::vector<ParBackend> backends = {
      ParBackend::kSeq, ParBackend::kThreads, ParBackend::kTbb,
      ParBackend::kOmp};

  for (auto backend : backends) {
    // Clear the results of the previous backend; otherwise a backend that
    // computes nothing would still pass on the values left behind.
    result.assign(result.size(), 0);

    parallel::parallel_for(
        SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; },
        backend);

    for (int i = 0; i < SIZE * SIZE; i++) {
      ASSERT_EQ(result[i], 2) << "backend " << static_cast<int>(backend)
                              << ", index " << i;
    }
  }
}
Loading
Loading