diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3d18507a..5ae64e1b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -22,6 +22,13 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") # endif() +# Set OpenMP +find_package(OpenMP) +include(ProcessorCount) +ProcessorCount(MAX_NUMBER_THREADS) +message("Setting maximum number of threads to ${MAX_NUMBER_THREADS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMAX_NUMBER_THREADS=${MAX_NUMBER_THREADS}") + ### Download GoogleTest include(FetchContent) FetchContent_Declare( diff --git a/src/applications/adaptive.cpp b/src/applications/adaptive.cpp index a622a0a3..32bd3a24 100644 --- a/src/applications/adaptive.cpp +++ b/src/applications/adaptive.cpp @@ -114,10 +114,12 @@ int main(int argc, char* argv[]) { std::string problem, domain; size_t initial_refines = 0; size_t max_dofs = 0; + size_t num_threads = 1; bool calculate_condition_numbers = false; bool print_centers = false; bool print_sampling = false; - bool print_time_apply = false; + bool print_time_apply = true; + bool print_bilforms = false; std::vector print_time_slices; boost::program_options::options_description problem_optdesc( "Problem options"); @@ -133,7 +135,9 @@ int main(int argc, char* argv[]) { "print_sampling", po::value(&print_sampling))( "print_time_slices", po::value>(&print_time_slices)->multitoken())( - "print_time_apply", po::value(&print_time_apply)); + "print_time_apply", po::value(&print_time_apply))( + "print_bilforms", po::value(&print_bilforms))( + "num_threads", po::value(&num_threads)); std::sort(print_time_slices.begin(), print_time_slices.end()); @@ -165,10 +169,20 @@ int main(int argc, char* argv[]) { po::store(po::command_line_parser(argc, argv).options(cmdline_options).run(), vm); po::notify(vm); + assert(num_threads > 0); + if (num_threads > 1 && adapt_opts.use_cache) { + std::cout << "Multithreading is only enabled for no-cache." << std::endl; + return 1; + } + assert(num_threads <= omp_get_max_threads()); + assert(num_threads <= MAX_NUMBER_THREADS); + omp_set_num_threads(num_threads); + std::cout << "Problem options:" << std::endl; std::cout << "\tProblem: " << problem << std::endl; std::cout << "\tDomain: " << domain << "; initial-refines: " << initial_refines << std::endl; + std::cout << "\tNumber-threads: " << num_threads << std::endl; std::cout << std::endl; std::cout << adapt_opts << std::endl; @@ -215,9 +229,13 @@ int main(int argc, char* argv[]) { // A slight overestimate. ndof_Xd = vec_Xd->Bfs().size(); + size_t ndof_Xd_time = vec_Xd->Project_0()->Bfs().size(); + size_t ndof_Xd_space = vec_Xd->Project_1()->Bfs().size(); size_t ndof_Xdd = heat_eq.vec_Xdd()->Bfs().size(); size_t ndof_Ydd = heat_eq.vec_Ydd()->Bfs().size(); std::cout << "iter: " << ++iter << "\n\tXDelta-size: " << ndof_Xd + << "\n\tXDelta-space-size: " << ndof_Xd_space + << "\n\tXDelta-time-size: " << ndof_Xd_time << "\n\tXDelta-Gradedness: " << vec_Xd->Gradedness(&max_gradedness) << "\n\tXDeltaDelta-size: " << ndof_Xdd @@ -317,18 +335,25 @@ int main(int argc, char* argv[]) { if (print_time_apply) { auto heat_d_dd = heat_eq.heat_d_dd(); - std::cout << "\n\tA-time-per-apply: " << heat_d_dd->A()->TimePerApply() - << "\n\tB-time-per-apply: " << heat_d_dd->B()->TimePerApply() - << "\n\tBT-time-per-apply: " << heat_d_dd->BT()->TimePerApply() - << "\n\tG-time-per-apply: " << heat_d_dd->G()->TimePerApply() - << "\n\tP_Y-time-per-apply: " - << heat_d_dd->P_Y()->TimePerApply() - << "\n\tP_X-time-per-apply: " - << heat_d_dd->P_X()->TimePerApply() - << "\n\tS-time-per-apply: " << heat_d_dd->S()->TimePerApply() - << "\n\ttotal-time-apply: " << heat_d_dd->TotalTimeApply() - << "\n\ttotal-time-construct: " - << heat_d_dd->TotalTimeConstruct() << std::flush; + std::cout + << "\n\tA-time-per-apply: " << heat_d_dd->A()->TimePerApply() + << "\n\tB-time-per-apply: " << heat_d_dd->B()->TimePerApply() + << "\n\tB-A-time-per-apply: " << heat_d_dd->B()->A()->TimePerApply() + << "\n\tB-B-time-per-apply: " << heat_d_dd->B()->B()->TimePerApply() + << "\n\tBT-time-per-apply: " << heat_d_dd->BT()->TimePerApply() + << "\n\tG-time-per-apply: " << heat_d_dd->G()->TimePerApply() + << "\n\tP_Y-time-per-apply: " << heat_d_dd->P_Y()->TimePerApply() + << "\n\tP_X-time-per-apply: " << heat_d_dd->P_X()->TimePerApply() + << "\n\tS-time-per-apply: " << heat_d_dd->S()->TimePerApply() + << "\n\ttotal-time-apply: " << heat_d_dd->TotalTimeApply() + << "\n\ttotal-time-construct: " << heat_d_dd->TotalTimeConstruct() + << std::flush; + } + if (print_bilforms) { + auto heat_d_dd = heat_eq.heat_d_dd(); + std::cout << "\n\tB-A-bilforms: " << heat_d_dd->B()->A()->Information() + << "\n\tP_Y-bilforms: " << heat_d_dd->P_Y()->Information() + << std::flush; } if (print_centers) { diff --git a/src/applications/uniform.cpp b/src/applications/uniform.cpp index 4d52bc37..5761d727 100644 --- a/src/applications/uniform.cpp +++ b/src/applications/uniform.cpp @@ -53,10 +53,11 @@ int main(int argc, char* argv[]) { size_t initial_refines = 0; size_t max_level = 0; size_t max_dofs = 0; + size_t num_threads = 1; std::string refine; bool calculate_condition_PY = false; bool calculate_condition_PX = false; - bool print_time_apply = false; + bool print_time_apply = true; bool print_centers = false; double solve_rtol = 1e-5; boost::program_options::options_description problem_optdesc( @@ -70,6 +71,7 @@ int main(int argc, char* argv[]) { ->default_value(std::numeric_limits::max()))( "max_dofs", po::value(&max_dofs)->default_value( std::numeric_limits::max()))( + "num_threads", po::value(&num_threads))( "refine", po::value(&refine)->default_value("sparse"))( "print_centers", po::value(&print_centers))( "print_time_apply", po::value(&print_time_apply))( @@ -118,6 +120,10 @@ int main(int argc, char* argv[]) { std::cout << adapt_opts << "\tsolve-rtol: " << solve_rtol << std::endl << std::endl; + assert(num_threads > 0 && num_threads <= omp_get_max_threads() && + num_threads <= MAX_NUMBER_THREADS); + omp_set_num_threads(num_threads); + auto T = InitialTriangulation(domain, initial_refines); auto B = Time::Bases(); auto vec_Xd = std::make_shared< @@ -229,18 +235,19 @@ int main(int argc, char* argv[]) { if (print_time_apply) { auto heat_d_dd = heat_eq.heat_d_dd(); - std::cout << "\n\tA-time-per-apply: " << heat_d_dd->A()->TimePerApply() - << "\n\tB-time-per-apply: " << heat_d_dd->B()->TimePerApply() - << "\n\tBT-time-per-apply: " << heat_d_dd->BT()->TimePerApply() - << "\n\tG-time-per-apply: " << heat_d_dd->G()->TimePerApply() - << "\n\tP_Y-time-per-apply: " - << heat_d_dd->P_Y()->TimePerApply() - << "\n\tP_X-time-per-apply: " - << heat_d_dd->P_X()->TimePerApply() - << "\n\tS-time-per-apply: " << heat_d_dd->S()->TimePerApply() - << "\n\ttotal-time-apply: " << heat_d_dd->TotalTimeApply() - << "\n\ttotal-time-construct: " - << heat_d_dd->TotalTimeConstruct() << std::flush; + std::cout + << "\n\tA-time-per-apply: " << heat_d_dd->A()->TimePerApply() + << "\n\tB-time-per-apply: " << heat_d_dd->B()->TimePerApply() + << "\n\tB-A-time-per-apply: " << heat_d_dd->B()->A()->TimePerApply() + << "\n\tB-B-time-per-apply: " << heat_d_dd->B()->B()->TimePerApply() + << "\n\tBT-time-per-apply: " << heat_d_dd->BT()->TimePerApply() + << "\n\tG-time-per-apply: " << heat_d_dd->G()->TimePerApply() + << "\n\tP_Y-time-per-apply: " << heat_d_dd->P_Y()->TimePerApply() + << "\n\tP_X-time-per-apply: " << heat_d_dd->P_X()->TimePerApply() + << "\n\tS-time-per-apply: " << heat_d_dd->S()->TimePerApply() + << "\n\ttotal-time-apply: " << heat_d_dd->TotalTimeApply() + << "\n\ttotal-time-construct: " << heat_d_dd->TotalTimeConstruct() + << std::flush; } if (print_centers) { diff --git a/src/datastructures/boost.hpp b/src/datastructures/boost.hpp index a14afec6..0ddc15ed 100644 --- a/src/datastructures/boost.hpp +++ b/src/datastructures/boost.hpp @@ -3,6 +3,7 @@ #include #include #include +#include template using SmallVector = boost::container::small_vector; diff --git a/src/datastructures/multi_tree_view.ipp b/src/datastructures/multi_tree_view.ipp index 452e39c1..20fdf709 100644 --- a/src/datastructures/multi_tree_view.ipp +++ b/src/datastructures/multi_tree_view.ipp @@ -87,13 +87,14 @@ std::vector MultiNodeViewInterface::Union( // Now do the union magic in all dimensions. static_for([&queue, &my_node, &other_node, &call_filter](auto i) { // Get a list of all children of the other_node in axis `i`. - static std::vector filtered_children; + static thread_local std::vector filtered_children; filtered_children.clear(); for (const auto& other_child_i : other_node->children(i)) if (call_filter(other_child_i)) filtered_children.emplace_back(other_child_i); - static std::vector> other_children_i; + static thread_local std::vector> + other_children_i; other_children_i.clear(); for (const auto& other_child_i : filtered_children) other_children_i.emplace_back(std::get(other_child_i->nodes())); diff --git a/src/datastructures/tree.hpp b/src/datastructures/tree.hpp index 45bbbd1c..c70b48be 100644 --- a/src/datastructures/tree.hpp +++ b/src/datastructures/tree.hpp @@ -1,5 +1,8 @@ #pragma once +#include + #include +#include #include #include #include @@ -17,6 +20,12 @@ using T_func_noop = decltype(func_noop); using T_func_true = decltype(func_true); using T_func_false = decltype(func_false); +// Global variable holding the current thread number. +#ifndef MAX_NUMBER_THREADS +#define MAX_NUMBER_THREADS 1 +#endif +static thread_local int thread_number = omp_get_thread_num(); + template struct NodeTrait; // This should define N_children and N_parents. @@ -38,8 +47,8 @@ class Node { } int level() const { return level_; } - bool marked() const { return marked_; } - void set_marked(bool value) { marked_ = value; } + bool marked() const { return marked_[thread_number]; } + void set_marked(bool value) { marked_[thread_number] = value; } bool is_leaf() const { return children_.size() == 0; } inline bool is_metaroot() const { return (level_ == -1); } const auto &parents() const { return parents_; } @@ -48,20 +57,19 @@ class Node { // General data field for universal storage. template T *data() { - assert(data_ != nullptr); - return static_cast(data_); + assert(data_[thread_number] != nullptr); + return static_cast(data_[thread_number]); } - template void set_data(T *value) { - assert(data_ == nullptr); - data_ = static_cast(value); + assert(data_[thread_number] == nullptr); + data_[thread_number] = static_cast(value); } void reset_data() { - assert(data_ != nullptr); - data_ = nullptr; + assert(data_[thread_number] != nullptr); + data_[thread_number] = nullptr; } - bool has_data() { return data_ != nullptr; } + bool has_data() { return data_[thread_number] != nullptr; } template std::vector Bfs(bool include_metaroot = false, @@ -93,9 +101,9 @@ class Node { } protected: - bool marked_ = false; int level_; - void *data_ = nullptr; + std::array marked_{0}; + std::array data_{nullptr}; // Store children/parents as raw pointers. SmallVector::N_children> children_; diff --git a/src/space/CMakeLists.txt b/src/space/CMakeLists.txt index 473fcbfe..dc0df2dd 100644 --- a/src/space/CMakeLists.txt +++ b/src/space/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(space STATIC triangulation.cpp initial_triangulation.cpp basis.cpp operators.cpp triangulation_view.cpp linear_form.cpp integration.cpp) +target_link_libraries(space PUBLIC OpenMP::OpenMP_CXX) # Executables add_executable(space_adaptive adaptive.cpp) diff --git a/src/space/operators.cpp b/src/space/operators.cpp index bcaf33df..2bcb838a 100644 --- a/src/space/operators.cpp +++ b/src/space/operators.cpp @@ -213,13 +213,13 @@ void CGInverse::ApplySingleScale(Eigen::VectorXd &vec_SS) const { // Define the class variables. template -std::vector>> +thread_local std::vector>> MultigridPreconditioner::row_mat; template -std::vector> +thread_local std::vector> MultigridPreconditioner::patches; template -std::vector> +thread_local std::vector> MultigridPreconditioner::vertices_relaxation; template @@ -346,12 +346,12 @@ void MultigridPreconditioner::ApplySingleScale( // Shortcut. const uint V = triang_.V; - // Reuse a static variable for storing the corrections. - static std::vector e; + // Reuse a static variable for storing the row of a matrix. + static thread_local std::vector e; e.reserve(V * 3); // Reuse a static variable for storing the residual in the downard cycle. - static std::vector r_down; + static thread_local std::vector r_down; r_down.reserve(V * 3); // Initialize the multigrid matrix (row_mat). diff --git a/src/space/operators.hpp b/src/space/operators.hpp index 49c1e0ee..d6aeae9a 100644 --- a/src/space/operators.hpp +++ b/src/space/operators.hpp @@ -221,9 +221,9 @@ class MultigridPreconditioner : public BackwardOperator { DirectInverse initial_triang_solver_; // (Static) variables reused for calculation of the multigrid matrix. - static std::vector>> row_mat; - static std::vector> patches; - static std::vector> vertices_relaxation; + static thread_local std::vector>> row_mat; + static thread_local std::vector> patches; + static thread_local std::vector> vertices_relaxation; }; template