diff --git a/apps/test_suite_runner/AbstractTestSuiteRunner.hpp b/apps/test_suite_runner/AbstractTestSuiteRunner.hpp index 86a9f1ea..f023f937 100644 --- a/apps/test_suite_runner/AbstractTestSuiteRunner.hpp +++ b/apps/test_suite_runner/AbstractTestSuiteRunner.hpp @@ -28,15 +28,16 @@ limitations under the License. #include #include -#include -#include #include "ConfigParser.hpp" #include "StatsModules/IStatsModule.hpp" -#include "osp/bsp/model/BspInstance.hpp" #include "osp/auxiliary/io/arch_file_reader.hpp" #include "osp/auxiliary/io/general_file_reader.hpp" +#include "osp/auxiliary/return_status.hpp" +#include "osp/bsp/model/BspInstance.hpp" +#include +#include -//#define EIGEN_FOUND 1 +// #define EIGEN_FOUND 1 #ifdef EIGEN_FOUND #include @@ -83,7 +84,7 @@ class AbstractTestSuiteRunner { if (write_target_object_to_file) { output_target_object_dir_path = parser.global_params.get_child("scheduleDirectory") - .get_value(); + .get_value(); if (output_target_object_dir_path.substr(0, 1) != "/") output_target_object_dir_path = executable_dir + output_target_object_dir_path; if (!output_target_object_dir_path.empty() && !std::filesystem::exists(output_target_object_dir_path)) { @@ -167,13 +168,13 @@ class AbstractTestSuiteRunner { } } - virtual RETURN_STATUS compute_target_object_impl(const BspInstance &instance, std::unique_ptr& target_object, - const pt::ptree &algo_config, - long long &computation_time_ms) = 0; + virtual RETURN_STATUS compute_target_object_impl(const BspInstance &instance, std::unique_ptr &target_object, + const pt::ptree &algo_config, + long long &computation_time_ms) = 0; virtual void create_and_register_statistic_modules(const std::string &module_name) = 0; - virtual void write_target_object_hook(const TargetObjectType&, const std::string &, const std::string &, + virtual void write_target_object_hook(const TargetObjectType &, const std::string &, const std::string &, const std::string &) { } // default in case TargetObjectType cannot be written to 
file @@ -250,7 +251,7 @@ class AbstractTestSuiteRunner { log_stream << "Start Graph: " + filename_graph + "\n"; BspInstance bsp_instance; - bsp_instance.setArchitecture(arch); + bsp_instance.getArchitecture() = arch; bool graph_status = false; std::string ext; if (filename_graph.rfind('.') != std::string::npos) @@ -268,12 +269,12 @@ class AbstractTestSuiteRunner { SM_csc_int64 L_csc_int64{}; if constexpr (std::is_same_v || std::is_same_v) { - if (ext != "mtx"){ + if (ext != "mtx") { log_stream << "Error: Only .mtx file is accepted for SpTRSV" << std::endl; return 0; } - - if constexpr (std::is_same_v){ + + if constexpr (std::is_same_v) { graph_status = Eigen::loadMarket(L_csr_int32, filename_graph); if (!graph_status) { std::cerr << "Failed to read matrix from " << filename_graph << std::endl; @@ -297,7 +298,7 @@ class AbstractTestSuiteRunner { } } else { #endif - graph_status = file_reader::readGraph(filename_graph, bsp_instance.getComputationalDag()); + graph_status = file_reader::readGraph(filename_graph, bsp_instance.getComputationalDag()); #ifdef EIGEN_FOUND } @@ -309,22 +310,20 @@ class AbstractTestSuiteRunner { for (auto &algorithm_config_pair : parser.scheduler) { const pt::ptree &algo_config = algorithm_config_pair.second; - - std::string current_algo_name = algo_config.get_child("name").get_value(); log_stream << "Start Algorithm " + current_algo_name + "\n"; long long computation_time_ms; - std::unique_ptr target_object; - + std::unique_ptr target_object; + RETURN_STATUS exec_status = compute_target_object_impl(bsp_instance, target_object, algo_config, computation_time_ms); if (exec_status != RETURN_STATUS::OSP_SUCCESS && exec_status != RETURN_STATUS::BEST_FOUND) { if (exec_status == RETURN_STATUS::ERROR) log_stream << "Error computing with " << current_algo_name << "." << std::endl; else if (exec_status == RETURN_STATUS::TIMEOUT) - log_stream << "Scheduler " << current_algo_name << " timed out." 
<< std::endl; + log_stream << "Scheduler " << current_algo_name << " timed out." << std::endl; continue; } diff --git a/apps/test_suite_runner/StringToScheduler/run_bsp_scheduler.hpp b/apps/test_suite_runner/StringToScheduler/run_bsp_scheduler.hpp index 97e7e473..08209efd 100644 --- a/apps/test_suite_runner/StringToScheduler/run_bsp_scheduler.hpp +++ b/apps/test_suite_runner/StringToScheduler/run_bsp_scheduler.hpp @@ -57,8 +57,8 @@ limitations under the License. namespace osp { const std::set get_available_bsp_scheduler_names() { - return {"Serial", "GreedyBsp", "GrowLocal", "BspLocking", "Cilk", "Etf", "GreedyRandom", - "GreedyChildren", "Variance", "MultiHC", "LocalSearch", "Coarser", "FullILP", "MultiLevel"}; + return {"Serial", "GreedyBsp", "GrowLocal", "BspLocking", "Cilk", "Etf", "GreedyRandom", + "GreedyChildren", "Variance", "MultiHC", "LocalSearch", "Coarser", "FullILP", "MultiLevel"}; } template @@ -247,7 +247,7 @@ RETURN_STATUS run_bsp_scheduler(const ConfigParser &parser, const boost::propert if (!status) return RETURN_STATUS::ERROR; - instance_coarse.setArchitecture(instance.getArchitecture()); + instance_coarse.getArchitecture() = instance.getArchitecture(); instance_coarse.setNodeProcessorCompatibility(instance.getProcessorCompatibilityMatrix()); BspSchedule schedule_coarse(instance_coarse); diff --git a/include/osp/auxiliary/io/arch_file_reader.hpp b/include/osp/auxiliary/io/arch_file_reader.hpp index 4e100ba8..71b0f006 100644 --- a/include/osp/auxiliary/io/arch_file_reader.hpp +++ b/include/osp/auxiliary/io/arch_file_reader.hpp @@ -18,10 +18,10 @@ limitations under the License. 
#pragma once +#include "osp/bsp/model/BspArchitecture.hpp" #include #include #include -#include "osp/bsp/model/BspArchitecture.hpp" namespace osp { namespace file_reader { @@ -31,7 +31,8 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture &archit // Skip comment lines while (std::getline(infile, line)) { - if (!line.empty() && line[0] != '%') break; + if (!line.empty() && line[0] != '%') + break; } // Parse architecture parameters @@ -58,24 +59,24 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture &archit if (0 <= mem_type && mem_type <= 3) { using memw_t = v_memw_t; switch (mem_type) { - case 0: - architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::NONE); - break; - case 1: - architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::LOCAL); - architecture.setMemoryBound(static_cast(M)); - break; - case 2: - architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::GLOBAL); - architecture.setMemoryBound(static_cast(M)); - break; - case 3: - architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT); - architecture.setMemoryBound(static_cast(M)); - break; - default: - std::cerr << "Invalid memory type.\n"; - return false; + case 0: + architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::NONE); + break; + case 1: + architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::LOCAL); + architecture.setMemoryBound(static_cast(M)); + break; + case 2: + architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::GLOBAL); + architecture.setMemoryBound(static_cast(M)); + break; + case 3: + architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT); + architecture.setMemoryBound(static_cast(M)); + break; + default: + std::cerr << "Invalid memory type.\n"; + return false; } } else if (mem_type == -1) { std::cout << "No memory type specified. 
Assuming \"NONE\".\n"; @@ -116,7 +117,7 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture &archit return false; } - architecture.setSendCosts(fromProc, toProc, static_cast>(value)); + architecture.SetSendCosts(fromProc, toProc, static_cast>(value)); } // Ensure there are no remaining non-comment lines @@ -127,7 +128,6 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture &archit } } - architecture.computeCommAverage(); return true; } diff --git a/include/osp/auxiliary/return_status.hpp b/include/osp/auxiliary/return_status.hpp new file mode 100644 index 00000000..e5f0b870 --- /dev/null +++ b/include/osp/auxiliary/return_status.hpp @@ -0,0 +1,56 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include + +namespace osp { + +enum class RETURN_STATUS { OSP_SUCCESS, + BEST_FOUND, + TIMEOUT, + ERROR }; + +/** + * @brief Converts the enum to a string literal. + * Returns const char* to avoid std::string allocation overhead. 
+ */ +inline const char *to_string(const RETURN_STATUS status) { + switch (status) { + case RETURN_STATUS::OSP_SUCCESS: + return "SUCCESS"; + case RETURN_STATUS::BEST_FOUND: + return "BEST FOUND"; + case RETURN_STATUS::TIMEOUT: + return "TIMEOUT"; + case RETURN_STATUS::ERROR: + return "ERROR"; + default: + return "UNKNOWN"; + } +} + +/** + * @brief Stream operator overload using the helper function. + */ +inline std::ostream &operator<<(std::ostream &os, RETURN_STATUS status) { + return os << to_string(status); +} + +} // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/model/BspArchitecture.hpp b/include/osp/bsp/model/BspArchitecture.hpp index 8ac1c0a8..5575fad2 100644 --- a/include/osp/bsp/model/BspArchitecture.hpp +++ b/include/osp/bsp/model/BspArchitecture.hpp @@ -34,46 +34,51 @@ limitations under the License. namespace osp { -static constexpr unsigned CacheLineSize = 64; - +/** + * @enum MEMORY_CONSTRAINT_TYPE + * @brief Enumerates the different types of memory constraints. + * Memory bounds are set per processor and apply to aggregated memory weights of nodes according to the different types of memory constraints. + */ enum class MEMORY_CONSTRAINT_TYPE { - NONE, - LOCAL, - GLOBAL, - PERSISTENT_AND_TRANSIENT, - LOCAL_IN_OUT, - LOCAL_INC_EDGES, - LOCAL_SOURCES_INC_EDGES + NONE, /** No memory constraints. */ + LOCAL, /** The memory bounds apply to the sum of memory weights of nodes assigned to the same processor and superstep. */ + GLOBAL, /** The memory bounds apply to the sum of memory weights of the nodes assigned to the same processor. */ + PERSISTENT_AND_TRANSIENT, /** Memory bounds apply to the sum of memory weights of nodes assigned to the same processor plus the maximum communication weight of a node assigned to a processor. */ + LOCAL_IN_OUT, /** Memory constraints are local in-out. Experimental. */ + LOCAL_INC_EDGES, /** Memory constraints are local incident edges. Experimental. 
*/ + LOCAL_SOURCES_INC_EDGES /** Memory constraints are local source incident edges. Experimental. */ }; -inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) { +/** + * @brief Converts the enum to a string literal. + * Returns const char* to avoid std::string allocation overhead. + */ +inline const char *to_string(MEMORY_CONSTRAINT_TYPE type) { switch (type) { case MEMORY_CONSTRAINT_TYPE::NONE: - os << "NONE"; - break; + return "NONE"; case MEMORY_CONSTRAINT_TYPE::LOCAL: - os << "LOCAL"; - break; + return "LOCAL"; case MEMORY_CONSTRAINT_TYPE::GLOBAL: - os << "GLOBAL"; - break; + return "GLOBAL"; case MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT: - os << "PERSISTENT_AND_TRANSIENT"; - break; + return "PERSISTENT_AND_TRANSIENT"; case MEMORY_CONSTRAINT_TYPE::LOCAL_IN_OUT: - os << "LOCAL_IN_OUT"; - break; + return "LOCAL_IN_OUT"; case MEMORY_CONSTRAINT_TYPE::LOCAL_INC_EDGES: - os << "LOCAL_INC_EDGES"; - break; + return "LOCAL_INC_EDGES"; case MEMORY_CONSTRAINT_TYPE::LOCAL_SOURCES_INC_EDGES: - os << "LOCAL_SOURCES_INC_EDGES"; - break; + return "LOCAL_SOURCES_INC_EDGES"; default: - os << "UNKNOWN"; - break; + return "UNKNOWN"; } - return os; +} + +/** + * @brief Stream operator overload using the helper function. + */ +inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) { + return os << to_string(type); } /** @@ -81,8 +86,29 @@ inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) { * @brief Represents the architecture of a BSP (Bulk Synchronous Parallel) system. * * The BspArchitecture class stores information about the number of processors, communication costs, - * synchronization costs, and send costs between processors in a BSP system. It provides methods to - * set and retrieve these values. + * synchronization costs, the send costs between processors, the types of processors, and the memory + * bounds. It provides methods to set and retrieve these values. 
+ * + * **Processors:** + * The architecture consists of p processors, indexed from 0 to p-1. Note that processor indices are represented using `unsigned`. + * + * **Processor Types:** + * Processors can have different types, which are represented by non-negative integers. + * Processor types are assumed to be consecutive integers starting from 0. Note that processor types are represented using `unsigned`. + * Processor types are used to express compatabilities, which can be specified in the BspInstance, regarding node types. + * + * **Communication and Synchronization Costs:** + * - Communication Cost (g): The cost of communicating a unit of data between processors, i.e., the bandwidth. + * - Synchronization Cost (L): The cost of synchronizing all processors at the end of a superstep. + * + * **Send Costs (NUMA):** + * The architecture supports Non-Uniform Memory Access (NUMA) effects via a send cost matrix. + * The cost to send data from processor i to processor j is given by g * sendCosts[i][j]. + * By default, send costs are uniform (1 for distinct processors, 0 for self). + * + * **Memory Constraints:** + * Each processor has a memory bound. The `MEMORY_CONSTRAINT_TYPE` determines how these bounds are applied + * (e.g., local per superstep, global per processor). */ template class BspArchitecture { @@ -90,84 +116,135 @@ class BspArchitecture { static_assert(is_computational_dag_v, "BspSchedule can only be used with computational DAGs."); private: - unsigned number_processors; - unsigned number_of_processor_types; + /** @brief The number of processors in the architecture. Must be at least 1. */ + unsigned numberOfProcessors_; + + /** @brief The number of processor types in the architecture. See processorTypes_ for more details. */ + unsigned numberOfProcessorTypes_; + + /** @brief The communication costs, typically denoted 'g' for the BSP model. 
*/ + v_commw_t communicationCosts_; - v_commw_t communication_costs; - v_commw_t synchronisation_costs; + /** @brief The synchronisation costs, typically denoted 'L' for the BSP model. */ + v_commw_t synchronisationCosts_; - std::vector> memory_bound; + /** @brief The architecture allows to specify memory bounds per processor. */ + std::vector> memoryBound_; - bool isNuma; + /** @brief Flag to indicate whether the architecture is NUMA , i.e., whether the send costs are different for different pairs of processors. */ + bool isNuma_; - std::vector processor_type; + /** @brief The architecture allows to specify processor types. Processor types are used to express compatabilities, which can be specified in the BspInstance, regarding node types. */ + std::vector processorTypes_; - std::vector>> send_costs; + /** @brief A flattened p x p matrix of send costs. Access via index [i * numberOfProcessors_ + j]. */ + std::vector> sendCosts_; - MEMORY_CONSTRAINT_TYPE memory_const_type = MEMORY_CONSTRAINT_TYPE::NONE; + /** @brief The memory constraint type. */ + MEMORY_CONSTRAINT_TYPE memoryConstraintType_ = MEMORY_CONSTRAINT_TYPE::NONE; + + /** @brief Helper function to calculate the index of a flattened p x p matrix. 
*/ + std::size_t FlatIndex(const unsigned row, const unsigned col) const { + return static_cast(row) * numberOfProcessors_ + col; + } - bool are_send_cost_numa() { - if (number_processors == 1) + bool AreSendCostsNuma() { + if (numberOfProcessors_ == 1U) return false; - v_commw_t val = send_costs[0][1]; - for (unsigned p1 = 0; p1 < number_processors; p1++) { - for (unsigned p2 = 0; p2 < number_processors; p2++) { + const v_commw_t val = sendCosts_[1U]; + for (unsigned p1 = 0U; p1 < numberOfProcessors_; p1++) { + for (unsigned p2 = 0U; p2 < numberOfProcessors_; p2++) { if (p1 == p2) continue; - if (send_costs[p1][p2] != val) + if (sendCosts_[FlatIndex(p1, p2)] != val) return true; } } return false; } - public: - BspArchitecture() - : number_processors(2), number_of_processor_types(1), communication_costs(1), synchronisation_costs(2), - memory_bound(std::vector>(number_processors, 100)), isNuma(false), - processor_type(std::vector(number_processors, 0)), - send_costs(std::vector>>( - number_processors, std::vector>(number_processors, 1))) { - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; + void UpdateNumberOfProcessorTypes() { + numberOfProcessorTypes_ = 0U; + for (unsigned p = 0U; p < numberOfProcessors_; p++) { + if (processorTypes_[p] >= numberOfProcessorTypes_) { + numberOfProcessorTypes_ = processorTypes_[p] + 1U; + } } } - BspArchitecture(const BspArchitecture &other) = default; - BspArchitecture(BspArchitecture &&other) = default; - BspArchitecture &operator=(const BspArchitecture &other) = default; - BspArchitecture &operator=(BspArchitecture &&other) = default; - ~BspArchitecture() = default; + void SetSendCostDiagonalToZero() { + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + sendCosts_[FlatIndex(i, i)] = 0U; + } + } + + void InitializeUniformSendCosts() { + sendCosts_.assign(numberOfProcessors_ * numberOfProcessors_, 1U); + SetSendCostDiagonalToZero(); + isNuma_ = false; + } + public: /** * @brief Constructs a 
BspArchitecture object with the specified number of processors, communication cost, and * synchronization cost. * - * @param processors The number of processors in the architecture. - * @param comm_cost The communication cost between processors. - * @param synch_cost The synchronization cost between processors. - */ - BspArchitecture(unsigned processors, v_commw_t comm_cost, v_commw_t synch_cost, - v_memw_t memory_bound_ = 100) - : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost), - synchronisation_costs(synch_cost), - memory_bound(std::vector>(number_processors, memory_bound_)), isNuma(false), - processor_type(std::vector(number_processors, 0)), - send_costs(std::vector>>( - number_processors, std::vector>(number_processors, 1))) { - - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; + * @param NumberOfProcessors The number of processors in the architecture. Must be greater than 0. Default: 2. + * @param CommunicationCost The communication cost between processors. Default: 1. + * @param SynchronisationCost The synchronization cost between processors. Default: 2. + * @param MemoryBound The memory bound for each processor (default: 100). + * @param SendCosts The matrix of send costs between processors. Needs to be a processors x processors matrix. Diagonal entries are forced to zero. Default: empty (uniform costs). 
+ */ + BspArchitecture(const unsigned NumberOfProcessors = 2U, const v_commw_t CommunicationCost = 1U, const v_commw_t SynchronisationCost = 2U, + const v_memw_t MemoryBound = 100U, const std::vector>> &SendCosts = {}) + : numberOfProcessors_(NumberOfProcessors), numberOfProcessorTypes_(1U), communicationCosts_(CommunicationCost), + synchronisationCosts_(SynchronisationCost), + memoryBound_(NumberOfProcessors, MemoryBound), isNuma_(false), + processorTypes_(NumberOfProcessors, 0U) { + if (NumberOfProcessors == 0U) { + throw std::runtime_error("BspArchitecture: Number of processors must be greater than 0."); + } + + if (SendCosts.empty()) { + InitializeUniformSendCosts(); + } else { + if (NumberOfProcessors != SendCosts.size()) { + throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n"); + } + if (std::any_of(SendCosts.begin(), SendCosts.end(), + [NumberOfProcessors](const auto &thing) { return thing.size() != NumberOfProcessors; })) { + throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n"); + } + + sendCosts_.reserve(NumberOfProcessors * NumberOfProcessors); + for (const auto &row : SendCosts) { + sendCosts_.insert(sendCosts_.end(), row.begin(), row.end()); + } + + SetSendCostDiagonalToZero(); + isNuma_ = AreSendCostsNuma(); } } + BspArchitecture(const BspArchitecture &other) = default; + BspArchitecture(BspArchitecture &&other) noexcept = default; + BspArchitecture &operator=(const BspArchitecture &other) = default; + BspArchitecture &operator=(BspArchitecture &&other) noexcept = default; + virtual ~BspArchitecture() = default; + + /** + * @brief Copy constructor from a BspArchitecture with a different graph type. + * + * @tparam Graph_t_other The graph type of the other BspArchitecture. + * @param other The other BspArchitecture object. 
+ */ template BspArchitecture(const BspArchitecture &other) - : number_processors(other.numberOfProcessors()), number_of_processor_types(other.getNumberOfProcessorTypes()), - communication_costs(other.communicationCosts()), synchronisation_costs(other.synchronisationCosts()), - memory_bound(other.memoryBound()), isNuma(other.isNumaArchitecture()), processor_type(other.processorTypes()), - send_costs(other.sendCosts()) { + : numberOfProcessors_(other.numberOfProcessors()), numberOfProcessorTypes_(other.getNumberOfProcessorTypes()), + communicationCosts_(other.communicationCosts()), synchronisationCosts_(other.synchronisationCosts()), + memoryBound_(other.memoryBound()), isNuma_(other.isNumaArchitecture()), processorTypes_(other.processorTypes()), + sendCosts_(other.sendCostsVector()) { static_assert(std::is_same_v, v_memw_t>, "BspArchitecture: Graph_t and Graph_t_other have the same memory weight type."); @@ -180,81 +257,32 @@ class BspArchitecture { } /** - * @brief Constructs a BspArchitecture object with the specified number of processors, communication cost, and - * synchronization cost. - * - * @param processors The number of processors in the architecture. - * @param comm_cost The communication cost between processors. - * @param synch_cost The synchronization cost between processors. 
- */ - BspArchitecture(unsigned int processors, v_commw_t comm_cost, v_commw_t synch_cost, - std::vector>> send_costs_) - : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost), - synchronisation_costs(synch_cost), memory_bound(std::vector>(number_processors, 100)), - processor_type(std::vector(number_processors, 0)), send_costs(send_costs_) { - - if (number_processors != send_costs.size()) { - throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n"); - } - if (std::any_of(send_costs.begin(), send_costs.end(), - [processors](const auto &thing) { return thing.size() != processors; })) { - throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n"); - } - - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; - } - - isNuma = are_send_cost_numa(); - } - - /** - * @brief Constructs a BspArchitecture object with the specified number of processors, communication cost, and - * synchronization cost. + * @brief Constructs a BspArchitecture object with custom send costs. * - * @param processors The number of processors in the architecture. - * @param comm_cost The communication cost between processors. - * @param synch_cost The synchronization cost between processors. 
- */ - BspArchitecture(unsigned int processors, v_commw_t comm_cost, v_commw_t synch_cost, - v_memw_t memory_bound_, std::vector>> send_costs_) - : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost), - synchronisation_costs(synch_cost), - memory_bound(std::vector>(number_processors, memory_bound_)), - processor_type(std::vector(number_processors, 0)), send_costs(send_costs_) { - - if (number_processors != send_costs.size()) { - throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n"); - } - if (std::any_of(send_costs.begin(), send_costs.end(), - [processors](const auto &thing) { return thing.size() != processors; })) { - throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n"); - } - - for (unsigned i = 0u; i < number_processors; i++) { - send_costs[i][i] = 0u; - } - - isNuma = are_send_cost_numa(); - } + * @param NumberOfProcessors The number of processors. Must be greater than 0. + * @param CommunicationCost The communication cost. + * @param SynchronisationCost The synchronization cost. + * @param SendCosts The matrix of send costs between processors. Needs to be a processors x processors matrix. Diagonal entries are forced to zero. + */ + BspArchitecture(const unsigned NumberOfProcessors, const v_commw_t CommunicationCost, const v_commw_t SynchronisationCost, + const std::vector>> &SendCosts) + : BspArchitecture(NumberOfProcessors, CommunicationCost, SynchronisationCost, 100U, SendCosts) {} /** - * Sets the uniform send cost for each pair of processors in the BSP architecture. + * @brief Sets the uniform send cost for each pair of processors. * The send cost is set to 0 if the processors are the same, and 1 otherwise. - * This function assumes that the number of processors has already been set. 
*/ void SetUniformSendCost() { - - for (unsigned i = 0; i < number_processors; i++) { - for (unsigned j = 0; j < number_processors; j++) { + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + for (unsigned j = 0U; j < numberOfProcessors_; j++) { if (i == j) { - send_costs[i][j] = 0; + sendCosts_[FlatIndex(i, j)] = 0U; } else { - send_costs[i][j] = 1; + sendCosts_[FlatIndex(i, j)] = 1U; } } } - isNuma = false; + isNuma_ = false; } /** @@ -265,80 +293,59 @@ class BspArchitecture { * * @param base The base value used to calculate the send cost. */ - void SetExpSendCost(v_commw_t base) { - - isNuma = true; + void SetExpSendCost(const v_commw_t base) { + isNuma_ = true; unsigned maxPos = 1; constexpr unsigned two = 2; - for (; intpow(two, maxPos + 1) <= number_processors - 1; ++maxPos) { + for (; intpow(two, maxPos + 1) <= numberOfProcessors_ - 1; ++maxPos) { } - for (unsigned i = 0; i < number_processors; ++i) - for (unsigned j = i + 1; j < number_processors; ++j) - for (unsigned pos = maxPos; pos <= maxPos; --pos) - if (((1 << pos) & i) != ((1 << pos) & j)) { - send_costs[i][j] = send_costs[j][i] = intpow(base, pos); + + for (unsigned i = 0U; i < numberOfProcessors_; ++i) { + for (unsigned j = i + 1U; j < numberOfProcessors_; ++j) { + // Corrected loop to avoid underflow issues with unsigned + for (int pos = static_cast(maxPos); pos >= 0; --pos) { + if (((1U << pos) & i) != ((1U << pos) & j)) { + sendCosts_[FlatIndex(i, j)] = sendCosts_[FlatIndex(j, i)] = intpow(base, static_cast(pos)); break; } + } + } + } } - inline auto processors() const { return integral_range(number_processors); } - /** - * @brief Computes the average communication cost of the BspArchitecture. - * - * This function computes the average communication cost of the BspArchitecture object. - * The average communication cost is calculated as the sum of the send costs between processors divided by the - * number of processors. - * - * @return The average communication cost as an unsigned integer. 
+ * @brief Returns a view of processor indices from 0 to numberOfProcessors_ - 1. + * @return An integral view of processor indices. */ - v_commw_t computeCommAverage() const { - - double avg = 0; - for (unsigned i = 0; i < number_processors; ++i) - for (unsigned j = 0; j < number_processors; ++j) - avg += static_cast(send_costs[i][j]); - avg = avg * static_cast(communication_costs) / static_cast(number_processors) / static_cast(number_processors); - - if (avg > static_cast(std::numeric_limits::max())) { - throw std::invalid_argument("avg comm exceeds the limit (something is very wrong)"); - } - - return static_cast>(std::round(avg)); - } + [[nodiscard]] auto processors() const { return integral_range(numberOfProcessors_); } /** - * Sets the send costs for the BspArchitecture. + * @brief Sets the send costs for the BspArchitecture. * * @param vec A 2D vector representing the send costs between processors. - * The size of the vector must be equal to the number of processors. - * Each inner vector must also have a size equal to the number of processors. - * @throws std::invalid_argument if the size of the vector or inner vectors is invalid. + * @throws std::invalid_argument if the size of the vector is invalid or diagonal elements are not 0. 
*/ - void setSendCosts(const std::vector>> &vec) { - - if (vec.size() != number_processors) { - throw std::invalid_argument("Invalid Argument"); + void SetSendCosts(const std::vector>> &vec) { + if (vec.size() != numberOfProcessors_) { + throw std::invalid_argument("Invalid Argument: Vector size mismatch."); } - isNuma = false; - for (unsigned i = 0; i < number_processors; i++) { - - if (vec[i].size() != number_processors) { - throw std::invalid_argument("Invalid Argument"); + isNuma_ = false; + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + if (vec.at(i).size() != numberOfProcessors_) { + throw std::invalid_argument("Invalid Argument: Inner vector size mismatch."); } - for (unsigned j = 0; j < number_processors; j++) { - + for (unsigned j = 0U; j < numberOfProcessors_; j++) { if (i == j) { - if (vec[i][j] != 0) - throw std::invalid_argument("Invalid Argument, Diagonal elements should be 0"); + if (vec.at(i).at(j) != 0U) + throw std::invalid_argument("Invalid Argument: Diagonal elements should be 0."); } else { - send_costs[i][j] = vec[i][j]; + sendCosts_.at(FlatIndex(i, j)) = vec.at(i).at(j); - if (number_processors > 1 && vec[i][j] != vec[0][1]) { - isNuma = true; + if (numberOfProcessors_ > 1U && vec.at(i).at(j) != vec.at(0U).at(1U)) { + isNuma_ = true; } } } @@ -346,324 +353,310 @@ class BspArchitecture { } /** - * Sets the send costs between two processors. + * @brief Sets the send costs between two processors. * - * @param p1 The index of the first processor. - * @param p2 The index of the second processor. + * @param p1 The index of the first processor. Must be less than numberOfProcessors_. + * @param p2 The index of the second processor. Must be less than numberOfProcessors_. * @param cost The cost of sending data between the processors. - * - * @remarks If the two processors are the same, the send cost is not set. - * If the cost is not equal to 1, the architecture is considered NUMA. 
+ * @throws std::invalid_argument if the processor indices are out of bounds. */ - void setSendCosts(unsigned p1, unsigned p2, v_commw_t cost) { - - if (p1 >= number_processors || p2 > number_processors) - throw std::invalid_argument("Invalid Argument"); + void SetSendCosts(const unsigned p1, const unsigned p2, const v_commw_t cost) { + if (p1 >= numberOfProcessors_ || p2 >= numberOfProcessors_) // Fixed condition: p2 >= number_processors + throw std::invalid_argument("Invalid Argument: Processor index out of bounds."); if (p1 != p2) { - send_costs[p1][p2] = cost; - - isNuma = are_send_cost_numa(); + sendCosts_.at(FlatIndex(p1, p2)) = cost; + isNuma_ = AreSendCostsNuma(); } } /** - * Sets the memory bound for all processors of the BspArchitecture. - * - * @param memory_bound_ The new memory bound for all processors. + * @brief Sets the memory bound for all processors. + * @param MemoryBound The new memory bound for all processors. */ - inline void setMemoryBound(v_memw_t memory_bound_) { - memory_bound = std::vector>(number_processors, memory_bound_); + void setMemoryBound(const v_memw_t MemoryBound) { + memoryBound_.assign(numberOfProcessors_, MemoryBound); } - inline void setMemoryBound(const std::vector> &memory_bound_) { memory_bound = memory_bound_; } - - inline void setMemoryBound(v_memw_t memory_bound_, unsigned proc) { - - if (proc >= number_processors) { - throw std::invalid_argument("Invalid Argument setMemoryBound"); + /** + * @brief Sets the memory bound for all processors using a vector. + * @param MemoryBound The vector of memory bounds. + * @throws std::invalid_argument if the size of the vector is invalid. 
+ */ + void setMemoryBound(const std::vector> &MemoryBound) { + if (MemoryBound.size() != numberOfProcessors_) { + throw std::invalid_argument("Invalid Argument: Memory bound vector size does not match number of processors."); } + memoryBound_ = MemoryBound; + } - memory_bound[proc] = memory_bound_; + /** + * @brief Sets the memory bound for a specific processor. + * @param MemoryBound The new memory bound for the processor. + * @param processorIndex The processor index. Must be less than numberOfProcessors_. + */ + void setMemoryBound(const v_memw_t MemoryBound, const unsigned processorIndex) { + memoryBound_.at(processorIndex) = MemoryBound; } /** - * @brief Sets the synchronization costs for the BspArchitecture. - * - * This function sets the synchronization costs for the BspArchitecture object. - * The synchronization costs represent the costs of establishing communication between processors. - * - * @param synch_cost The synchronization costs to be set. + * @brief Sets the synchronization costs. + * @param SynchCost The new synchronization costs. */ - inline void setSynchronisationCosts(v_commw_t synch_cost) { synchronisation_costs = synch_cost; } + void setSynchronisationCosts(const v_commw_t SynchCost) { synchronisationCosts_ = SynchCost; } /** - * @brief Sets the communication costs for the BspArchitecture. - * - * This function sets the communication costs for the BspArchitecture object. - * The communication costs represent the costs of sending messages between processors. - * - * @param comm_cost The communication costs to be set. + * @brief Sets the communication costs. + * @param CommCost The new communication costs. */ - inline void setCommunicationCosts(v_commw_t comm_cost) { communication_costs = comm_cost; } + void setCommunicationCosts(const v_commw_t CommCost) { communicationCosts_ = CommCost; } /** - * @brief Sets the number of processors in the BSP architecture. 
- * - * This function sets the number of processors in the BSP architecture and sets the send costs between processors - * to 1. The send_costs matrix represents the costs of sending messages between processors. The diagonal elements of - * the matrix are set to 0, indicating that there is no cost to send a message from a processor to itself. - * - * @param num_proc The number of processors in the BSP architecture. + * @brief Checks if the architecture is NUMA. + * @return True if NUMA, false otherwise. */ - void setNumberOfProcessors(unsigned num_proc) { + [[nodiscard]] bool isNumaArchitecture() const { return isNuma_; } - number_processors = num_proc; - number_of_processor_types = 1; - processor_type = std::vector(number_processors, 0); - send_costs = std::vector>>( - number_processors, std::vector>(number_processors, 1)); - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; + /** + * @brief Sets the number of processors. Processor type is set to 0 for all processors. + * Resets send costs to uniform (1) and diagonal to 0. The memory bound is set to 100 for all processors. + * @param numberOfProcessors The number of processors. Must be greater than 0. + * @throws std::invalid_argument if the number of processors is 0. + */ + void setNumberOfProcessors(const unsigned numberOfProcessors) { + if (numberOfProcessors == 0) { + throw std::invalid_argument("Invalid Argument: Number of processors must be greater than 0."); } - memory_bound.resize(num_proc, memory_bound.back()); + numberOfProcessors_ = numberOfProcessors; + numberOfProcessorTypes_ = 1U; + processorTypes_.assign(numberOfProcessors_, 0U); + + InitializeUniformSendCosts(); - isNuma = false; + // initialize memory bound to 100 for all processors + memoryBound_.assign(numberOfProcessors_, 100U); } /** - * @brief Sets the number of processors and their types in the BSP architecture. 
- * - * This function sets the number of processors in the BSP architecture and sets the send costs between processors - * to 1. The send_costs matrix represents the costs of sending messages between processors. The diagonal elements of - * the matrix are set to 0, indicating that there is no cost to send a message from a processor to itself. - * - * @param processor_types_ The type of the respective processors. + * @brief Sets the number of processors and their types. Number of processors is set to the size of the processor types vector. + * Resets send costs to uniform (1). Resets memory bound to 100 for all processors. + * @param processorTypes The types of the respective processors. */ - void setProcessorsWithTypes(const std::vector> &processor_types_) { - - if (processor_types_.size() > std::numeric_limits::max()) { - throw std::invalid_argument("Invalid Argument, number of processors exceeds the limit"); + void setProcessorsWithTypes(const std::vector> &processorTypes) { + if (processorTypes.empty()) { + throw std::invalid_argument("Invalid Argument: Processor types vector is empty."); } - - number_processors = static_cast(processor_types_.size()); - - number_of_processor_types = 0; - processor_type = processor_types_; - send_costs = std::vector>>( - number_processors, std::vector>(number_processors, 1)); - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; + if (processorTypes.size() > std::numeric_limits::max()) { + throw std::invalid_argument("Invalid Argument: Number of processors exceeds the limit."); } - memory_bound.resize(number_processors, memory_bound.back()); + numberOfProcessors_ = static_cast(processorTypes.size()); + processorTypes_ = processorTypes; - isNuma = false; - updateNumberOfProcessorTypes(); + InitializeUniformSendCosts(); + + // initialize memory bound to 100 for all processors + memoryBound_.assign(numberOfProcessors_, 100U); + UpdateNumberOfProcessorTypes(); } /** - * Returns whether the architecture is NUMA. 
- * - * @return True if the architecture is NUMA, false otherwise. + * @brief Sets processors based on counts of consecutive types. + * The architecture will have processorTypeCount[0] processors of type 0, processorTypeCount[1] processors of type 1, etc. + * The memory bound for each processor of type i is set to processorTypeMemory[i]. + * The send costs are set to uniform (1). + * @param processorTypeCount Vector where index is type and value is count of processors of that type. + * @param processorTypeMemory Vector where index is type and value is memory bound for that type. */ - inline bool isNumaArchitecture() const { return isNuma; } - - void set_processors_consequ_types(const std::vector> &processor_type_count_, - const std::vector> &processor_type_memory_) { - - if (processor_type_count_.size() != processor_type_memory_.size()) { - throw std::invalid_argument( - "Invalid Argument, processor_type_count_ and processor_type_memory_ must have the same size"); + void SetProcessorsConsequTypes(const std::vector> &processorTypeCount, + const std::vector> &processorTypeMemory) { + if (processorTypeCount.size() != processorTypeMemory.size()) { + throw std::invalid_argument("Invalid Argument: processorTypeCount and processorTypeMemory must have the same size."); } - if (processor_type_count_.size() > std::numeric_limits::max()) { - throw std::invalid_argument("Invalid Argument, number of processors exceeds the limit"); + if (processorTypeCount.size() > std::numeric_limits::max()) { + throw std::invalid_argument("Invalid Argument: Number of processors exceeds the limit."); } - number_of_processor_types = static_cast(processor_type_count_.size()); - number_processors = std::accumulate(processor_type_count_.begin(), processor_type_count_.end(), 0u); - - processor_type = std::vector>(number_processors, 0); - memory_bound = std::vector>(number_processors, 0); + numberOfProcessorTypes_ = static_cast(processorTypeCount.size()); + numberOfProcessors_ = 
std::accumulate(processorTypeCount.begin(), processorTypeCount.end(), 0U); - unsigned offset = 0; - for (unsigned i = 0; i < processor_type_count_.size(); i++) { + // initialize processor types and memory bound + processorTypes_.assign(numberOfProcessors_, 0U); + memoryBound_.assign(numberOfProcessors_, 0U); - for (unsigned j = 0; j < processor_type_count_[i]; j++) { - processor_type[offset + j] = i; - memory_bound[offset + j] = processor_type_memory_[i]; + unsigned offset = 0U; + for (unsigned i = 0U; i < processorTypeCount.size(); i++) { + for (unsigned j = 0U; j < processorTypeCount.at(i); j++) { + processorTypes_.at(offset + j) = i; + memoryBound_.at(offset + j) = processorTypeMemory.at(i); } - offset += processor_type_count_[i]; + offset += processorTypeCount.at(i); } - send_costs = std::vector>>( - number_processors, std::vector>(number_processors, 1)); - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; - } - isNuma = false; + InitializeUniformSendCosts(); } /** - * Returns the memory bound of the BspArchitecture. - * - * @return The memory bound as an unsigned integer. + * @brief Returns the memory bounds of all processors. + * @return Vector of memory bounds. */ - inline const std::vector> &memoryBound() const { return memory_bound; } + [[nodiscard]] const std::vector> &memoryBound() const { return memoryBound_; } - inline v_memw_t memoryBound(unsigned proc) const { return memory_bound[proc]; } + /** + * @brief Returns the memory bound of a specific processor. + * @param proc The processor index. + * @return The memory bound. 
+ */ + [[nodiscard]] v_memw_t memoryBound(const unsigned proc) const { return memoryBound_[proc]; } - v_memw_t minMemoryBound() const { return *(std::min_element(memory_bound.begin(), memory_bound.end())); } - v_memw_t maxMemoryBound() const { return *(std::max_element(memory_bound.begin(), memory_bound.end())); } - v_memw_t sumMemoryBound() const { return std::accumulate(memory_bound.begin(), memory_bound.end(), 0); } + /** + * @brief Returns the maximum memory bound over all processors. + * @return The maximum memory bound. + */ + [[nodiscard]] v_memw_t maxMemoryBound() const { return *(std::max_element(memoryBound_.begin(), memoryBound_.end())); } - v_memw_t maxMemoryBoundProcType(v_type_t procType) const { - v_memw_t max_mem = 0; - for (unsigned proc = 0; proc < number_processors; proc++) { - if (processor_type[proc] == procType) { - max_mem = std::max(max_mem, memory_bound[proc]); + /** + * @brief Returns the maximum memory bound over all processors of a specific type. + * + * @param procType The processor type. + * @return The maximum memory bound. + */ + [[nodiscard]] v_memw_t maxMemoryBoundProcType(const v_type_t procType) const { + v_memw_t max_mem = 0U; + for (unsigned proc = 0U; proc < numberOfProcessors_; proc++) { + if (processorTypes_[proc] == procType) { + max_mem = std::max(max_mem, memoryBound_[proc]); } } return max_mem; } /** - * Returns the number of processors in the architecture. - * + * @brief Returns the number of processors. * @return The number of processors. */ - inline unsigned numberOfProcessors() const { return number_processors; } + [[nodiscard]] unsigned numberOfProcessors() const { return numberOfProcessors_; } /** - * Returns the communication costs of the BSP architecture. - * - * @return The communication costs as an unsigned integer. + * @brief Returns the communication costs. + * @return The communication costs. 
*/ - inline v_commw_t communicationCosts() const { return communication_costs; } + [[nodiscard]] v_commw_t communicationCosts() const { return communicationCosts_; } /** - * Returns the synchronization costs of the BspArchitecture. - * - * @return The synchronization costs as an unsigned integer. + * @brief Returns the synchronization costs. + * @return The synchronization costs. */ - inline v_commw_t synchronisationCosts() const { return synchronisation_costs; } + [[nodiscard]] v_commw_t synchronisationCosts() const { return synchronisationCosts_; } /** - * Returns a copy of the send costs matrix. - * - * @return A copy of the send costs matrix. + * @brief Returns a the send costs matrix. Internally the matrix is stored as a flattened matrix. The allocates, computes and returns the matrix on the fly. + * @return The send costs matrix. */ - inline std::vector>> sendCostMatrixCopy() const { return send_costs; } + [[nodiscard]] std::vector>> sendCost() const { + std::vector>> matrix(numberOfProcessors_, std::vector>(numberOfProcessors_)); + for (unsigned i = 0; i < numberOfProcessors_; ++i) { + for (unsigned j = 0; j < numberOfProcessors_; ++j) { + matrix[i][j] = sendCosts_[FlatIndex(i, j)]; + } + } + return matrix; + } /** - * Returns a reference to the send costs matrix. - * - * @return A reference to the send costs matrix. + * @brief Returns the flattened send costs vector. + * @return The send costs vector. */ - inline const std::vector>> &sendCostMatrix() const { return send_costs; } + [[nodiscard]] const std::vector> &sendCostsVector() const { return sendCosts_; } - // the type indeces of the processor (e.g. CPU, vector/tensor core) - inline const std::vector &processorTypes() const { return processor_type; } + /** + * @brief Returns the processor types. + * @return Vector of processor types. + */ + [[nodiscard]] const std::vector &processorTypes() const { return processorTypes_; } /** - * Returns the communication costs between two processors. 
The communication costs are the send costs multiplied by - * the communication costs. + * @brief Returns the communication costs between two processors. Does not perform bounds checking. + * The communication costs are the send costs multiplied by the communication costs factor. * * @param p1 The index of the first processor. * @param p2 The index of the second processor. - * - * @return The send costs between the two processors. + * @return The communication costs between the two processors. */ - inline v_commw_t communicationCosts(unsigned p1, unsigned p2) const { - return communication_costs * send_costs[p1][p2]; + [[nodiscard]] v_commw_t communicationCosts(const unsigned p1, const unsigned p2) const { + return communicationCosts_ * sendCosts_[FlatIndex(p1, p2)]; } /** - * Returns the send costs between two processors. + * @brief Returns the send costs between two processors. Does not perform bounds checking. + * Does not the communication costs into account. * * @param p1 The index of the first processor. * @param p2 The index of the second processor. - * * @return The send costs between the two processors. */ - inline v_commw_t sendCosts(unsigned p1, unsigned p2) const { return send_costs[p1][p2]; } - - inline auto sendCosts() const { return send_costs; } - - // the type index of the processor (e.g. CPU, vector/tensor core) - inline v_type_t processorType(unsigned p1) const { return processor_type[p1]; } + [[nodiscard]] v_commw_t sendCosts(const unsigned p1, const unsigned p2) const { return sendCosts_[FlatIndex(p1, p2)]; } - void setProcessorType(unsigned p1, v_type_t type) { - - if (p1 >= number_processors) - throw std::invalid_argument("Invalid Argument"); + /** + * @brief Returns the type of a specific processor. Does not perform bounds checking. + * @param p1 The processor index. + * @return The processor type. 
+ */ + [[nodiscard]] v_type_t processorType(const unsigned p1) const { return processorTypes_[p1]; } - processor_type[p1] = type; - number_of_processor_types = std::max(number_of_processor_types, type + 1u); + /** + * @brief Sets the type of a specific processor. Performs bounds checking. + * @param p1 The processor index. + * @param type The new processor type. + */ + void setProcessorType(const unsigned p1, const v_type_t type) { + processorTypes_.at(p1) = type; + numberOfProcessorTypes_ = std::max(numberOfProcessorTypes_, type + 1U); } - std::vector getProcessorTypeCount() const { - - std::vector type_count(number_of_processor_types, 0u); - for (unsigned p = 0u; p < number_processors; p++) { - type_count[processor_type[p]]++; + /** + * @brief Returns the count of processors for each type. + * @return Vector where index is type and value is count. + */ + [[nodiscard]] std::vector getProcessorTypeCount() const { + std::vector type_count(numberOfProcessorTypes_, 0U); + for (unsigned p = 0U; p < numberOfProcessors_; p++) { + type_count[processorTypes_[p]]++; } return type_count; } - unsigned getMinProcessorTypeCount() const { - const auto &type_count = getProcessorTypeCount(); - if (type_count.empty()) { - return 0; - } - return *std::min_element(type_count.begin(), type_count.end()); - } - - void print_architecture(std::ostream &os) const { - - os << "Architectur info: number of processors: " << number_processors - << ", Number of processor types: " << number_of_processor_types - << ", Communication costs: " << communication_costs << ", Synchronization costs: " << synchronisation_costs - << std::endl; + /** + * @brief Prints the architecture details to the output stream. + * @param os The output stream. 
+ */ + void print(std::ostream &os) const { + os << "Architecture info: number of processors: " << numberOfProcessors_ + << ", Number of processor types: " << numberOfProcessorTypes_ + << ", Communication costs: " << communicationCosts_ << ", Synchronization costs: " << synchronisationCosts_ + << "\n"; os << std::setw(17) << " Processor: "; - for (unsigned i = 0; i < number_processors; i++) { + for (unsigned i = 0U; i < numberOfProcessors_; i++) { os << std::right << std::setw(5) << i << " "; } - os << std::endl; + os << "\n"; os << std::setw(17) << "Processor type: "; - for (unsigned i = 0; i < number_processors; i++) { - os << std::right << std::setw(5) << processor_type[i] << " "; + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + os << std::right << std::setw(5) << processorTypes_.at(i) << " "; } - os << std::endl; + os << "\n"; os << std::setw(17) << "Memory bound: "; - for (unsigned i = 0; i < number_processors; i++) { - os << std::right << std::setw(5) << memory_bound[i] << " "; - } - os << std::endl; - } - - void updateNumberOfProcessorTypes() { - number_of_processor_types = 0; - for (unsigned p = 0; p < number_processors; p++) { - if (processor_type[p] >= number_of_processor_types) { - number_of_processor_types = processor_type[p] + 1; - } - } - } - - std::vector> getProcessorIdsByType() const { - std::vector> processor_ids_by_type(number_of_processor_types); - for (unsigned i = 0; i < numberOfProcessors(); ++i) { - processor_ids_by_type[processorType(i)].push_back(i); + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + os << std::right << std::setw(5) << memoryBound_.at(i) << " "; } - return processor_ids_by_type; + os << "\n"; } - inline unsigned getNumberOfProcessorTypes() const { return number_of_processor_types; }; + [[nodiscard]] unsigned getNumberOfProcessorTypes() const { return numberOfProcessorTypes_; }; - inline MEMORY_CONSTRAINT_TYPE getMemoryConstraintType() const { return memory_const_type; } - inline void 
setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE memory_const_type_) { - memory_const_type = memory_const_type_; + [[nodiscard]] MEMORY_CONSTRAINT_TYPE getMemoryConstraintType() const { return memoryConstraintType_; } + void setMemoryConstraintType(const MEMORY_CONSTRAINT_TYPE memoryConstraintType) { + memoryConstraintType_ = memoryConstraintType; } }; diff --git a/include/osp/bsp/model/BspInstance.hpp b/include/osp/bsp/model/BspInstance.hpp index 4e31d145..bed4fd40 100644 --- a/include/osp/bsp/model/BspInstance.hpp +++ b/include/osp/bsp/model/BspInstance.hpp @@ -27,54 +27,73 @@ limitations under the License. namespace osp { -enum class RETURN_STATUS { OSP_SUCCESS, BEST_FOUND, TIMEOUT, ERROR }; - -inline std::string to_string(const RETURN_STATUS status) { - switch (status) { - case RETURN_STATUS::OSP_SUCCESS: - return "SUCCESS"; - case RETURN_STATUS::BEST_FOUND: - return "BEST FOUND"; - case RETURN_STATUS::TIMEOUT: - return "TIMEOUT"; - case RETURN_STATUS::ERROR: - return "ERROR"; - default: - return "UNKNOWN"; - } -} - -inline std::ostream& operator<<(std::ostream& os, RETURN_STATUS status) { - switch (status) { - case RETURN_STATUS::OSP_SUCCESS: os << "SUCCESS"; break; - case RETURN_STATUS::BEST_FOUND: os << "BEST_FOUND"; break; - case RETURN_STATUS::TIMEOUT: os << "TIMEOUT"; break; - case RETURN_STATUS::ERROR: os << "ERROR"; break; - default: os << "UNKNOWN"; break; - } - return os; -} - /** * @class BspInstance - * @brief Represents an instance of the BSP (Bulk Synchronous Parallel) model. + * @brief Represents a scheduling problem instance for the Bulk Synchronous Parallel (BSP) model. + * + * The BspInstance class serves as a container for all the necessary information to define a + * BSP scheduling problem. It acts as the "ground" object that holds the actual implementation + * of the graph and architecture. + * + * It aggregates three main components: + * + * 1. **Computational DAG**: The directed acyclic graph representing the program to be executed. 
+ * It defines the tasks (nodes), their dependencies (directed edges), and associated weights (work, memory, communication). * - * The BspInstance class encapsulates the computational DAG (Directed Acyclic Graph) and the BSP architecture - * for a specific instance of the BSP model. It provides methods to access and modify the architecture and DAG, - * as well as retrieve information about the instance such as the number of vertices and processors. + * 2. **BSP Architecture**: The hardware model description, including the number of processors, + * their types, memory bounds, and communication/synchronization costs. + * Note that processor indices are represented using `unsigned`. + * + * 3. **Node-Processor Compatibility**: A matrix defining which node types can be executed on which + * processor types. This enables the modeling of heterogeneous systems (e.g., CPU + GPU) where + * certain nodes are restricted to specific hardware accelerators. + * + * @warning Be careful when assigning an existing graph to a BspInstance. Depending on the + * constructor or assignment operator used, this may result in a deep copy of the graph structure, + * which can be expensive for large graphs. + * + * This class provides a unified interface to access and modify these components, facilitating + * the development of scheduling algorithms that need to query problem constraints and properties. + * + * @tparam Graph_t The type of the computational DAG, which must satisfy the `is_computational_dag` concept. */ template class BspInstance { - - static_assert(is_computational_dag_v, "BspSchedule can only be used with computational DAGs."); + static_assert(is_computational_dag_v, "BspInstance can only be used with computational DAGs."); private: + /** + * @brief The computational DAG representing the program structure. 
+ * + * It contains the graph topology (nodes and directed edges) as well as attributes such as node types, + * work weights, memory weights, and edge communication weights. + */ Graph_t cdag; + /** + * @brief The BSP architecture model. + * + * It defines the hardware characteristics including processor types, memory limits, + * communication bandwidth/latency (send costs), and global synchronization costs. + */ BspArchitecture architecture; - // for problem instances with heterogeneity + /** + * @brief Stores the compatibility between node types and processor types. + * + * The architecture defines a type for each processor, and the DAG defines a type for each node. + * This matrix stores for each node type and processor type whether they are compatible, i.e., + * if a node of that type can be assigned to a processor of the given type in a schedule. + * @note The outer vector is indexed by node type, the inner vector is indexed by processor type. + */ std::vector> nodeProcessorCompatibility = std::vector>({{true}}); + /** + * @brief The type of the vectex types in the computational DAG. + * If the DAG does not support vertex types, this is `unsigned`. + */ + using vertex_type_t_or_default = std::conditional_t, v_type_t, unsigned>; + using processor_type_t = unsigned; + public: /** * @brief Default constructor for the BspInstance class. @@ -83,6 +102,7 @@ class BspInstance { /** * @brief Constructs a BspInstance object with the specified computational DAG and BSP architecture. + * Computational DAG and BSP architecture are copied! * * @param cdag The computational DAG for the instance. * @param architecture The BSP architecture for the instance. @@ -93,6 +113,7 @@ class BspInstance { /** * @brief Constructs a BspInstance object with the specified computational DAG and BSP architecture. + * Computational DAG and BSP architecture are moved! * * @param cdag The computational DAG for the instance. * @param architecture The BSP architecture for the instance. 
@@ -110,191 +131,198 @@ class BspInstance { } BspInstance(const BspInstance &other) = default; - BspInstance(BspInstance &&other) = default; + BspInstance(BspInstance &&other) noexcept = default; BspInstance &operator=(const BspInstance &other) = default; - BspInstance &operator=(BspInstance &&other) = default; + BspInstance &operator=(BspInstance &&other) noexcept = default; /** - * @brief Returns a reference to the BSP architecture for the instance. - * - * @return A reference to the BSP architecture for the instance. + * @brief Returns a reference to the BSP architecture of the instance. + * Assigning the BSP architecture via the reference creates a copy of the architecture. + * The move operator may be used to transfer ownership of the architecture. */ - inline const BspArchitecture &getArchitecture() const { return architecture; } + [[nodiscard]] const BspArchitecture &getArchitecture() const { return architecture; } + [[nodiscard]] BspArchitecture &getArchitecture() { return architecture; } /** - * @brief Returns a reference to the BSP architecture for the instance. - * - * @return A reference to the BSP architecture for the instance. + * @brief Returns a reference to the computational DAG of the instance. + * Assigning the computational DAG via the reference creates a copy of the DAG. + * The move operator may be used to transfer ownership of the DAG. */ - inline BspArchitecture &getArchitecture() { return architecture; } + [[nodiscard]] const Graph_t &getComputationalDag() const { return cdag; } + [[nodiscard]] Graph_t &getComputationalDag() { return cdag; } /** - * @brief Sets the BSP architecture for the instance. - * - * @param architecture_ The BSP architecture for the instance. + * @brief Returns the number of vertices in the computational DAG. 
*/ - inline void setArchitecture(const BspArchitecture &architechture_) { architecture = architechture_; } + [[nodiscard]] vertex_idx_t numberOfVertices() const { return cdag.num_vertices(); } /** - * @brief Returns a reference to the computational DAG for the instance. - * - * @return A reference to the computational DAG for the instance. + * @brief Returns a view over the vertex indices of the computational DAG. */ - inline const Graph_t &getComputationalDag() const { return cdag; } + [[nodiscard]] auto vertices() const { return cdag.vertices(); } /** - * @brief Returns a reference to the computational DAG for the instance. - * - * @return A reference to the computational DAG for the instance. + * @brief Returns a view over the processor indices of the BSP architecture. */ - inline Graph_t &getComputationalDag() { return cdag; } - - inline vertex_idx_t numberOfVertices() const { return cdag.num_vertices(); } - - inline auto vertices() const { return cdag.vertices(); } - - inline auto processors() const { return architecture.processors(); } + [[nodiscard]] auto processors() const { return architecture.processors(); } /** * @brief Returns the number of processors in the BSP architecture. - * - * @return The number of processors in the BSP architecture. */ - inline unsigned numberOfProcessors() const { return architecture.numberOfProcessors(); } + [[nodiscard]] unsigned numberOfProcessors() const { return architecture.numberOfProcessors(); } /** - * @brief Returns the communication costs between two processors. - * + * @brief Returns the communication costs between two processors. Does not perform bounds checking. * The communication costs are the send costs multiplied by the communication costs. * - * @param p1 The index of the first processor. - * @param p2 The index of the second processor. - * - * @return The communication costs between the two processors. + * @param p_send The index of the sending processor. 
+ * @param p_receive The index of the receiving processor. */ - inline v_commw_t communicationCosts(unsigned int p1, unsigned int p2) const { - return architecture.communicationCosts(p1, p2); + [[nodiscard]] v_commw_t communicationCosts(const unsigned p_send, const unsigned p_receive) const { + return architecture.communicationCosts(p_send, p_receive); } /** - * @brief Returns the send costs between two processors. - * - * - * @param p1 The index of the first processor. - * @param p2 The index of the second processor. + * @brief Returns the send costs between two processors. Does not perform bounds checking. + * Does not the communication costs into account. * - * @return The send costs between the two processors. + * @param p_send The index of the sending processor. + * @param p_receive The index of the receiving processor. */ - inline v_commw_t sendCosts(unsigned int p1, unsigned int p2) const { - return architecture.sendCosts(p1, p2); + [[nodiscard]] v_commw_t sendCosts(const unsigned p_send, const unsigned p_receive) const { + return architecture.sendCosts(p_send, p_receive); } /** * @brief Returns a copy of the send costs matrix. - * - * @return A copy of the send costs matrix. */ - inline const std::vector>> &sendCostMatrix() const { - return architecture.sendCostMatrix(); + [[nodiscard]] std::vector>> sendCosts() const { return architecture.sendCosts(); } + + /** + * @brief Returns the flattened send costs vector. + */ + [[nodiscard]] const std::vector> &sendCostsVector() const { + return architecture.sendCostsVector(); } /** * @brief Returns the communication costs of the BSP architecture. - * - * @return The communication costs as an unsigned integer. */ - inline v_commw_t communicationCosts() const { return architecture.communicationCosts(); } + [[nodiscard]] v_commw_t communicationCosts() const { return architecture.communicationCosts(); } /** * @brief Returns the synchronization costs of the BSP architecture. 
- * - * @return The synchronization costs as an unsigned integer. */ - inline v_commw_t synchronisationCosts() const { return architecture.synchronisationCosts(); } + [[nodiscard]] v_commw_t synchronisationCosts() const { return architecture.synchronisationCosts(); } /** - * @brief Returns whether the architecture is NUMA. - * - * @return True if the architecture is NUMA, false otherwise. + * @brief Returns the memory bound for a specific processor. + * @param proc The processor index. */ - inline bool isNumaInstance() const { return architecture.isNumaArchitecture(); } - - inline v_memw_t memoryBound(unsigned proc) const { return architecture.memoryBound(proc); } - - v_memw_t maxMemoryBoundProcType(unsigned procType) const { - return architecture.maxMemoryBoundProcType(procType); - } - - v_memw_t maxMemoryBoundNodeType(unsigned nodeType) const { - int max_mem = 0; - for (unsigned proc = 0; proc < architecture.getNumberOfProcessorTypes(); proc++) { - if (isCompatibleType(nodeType, architecture.processorType(proc))) { - max_mem = std::max(max_mem, architecture.memoryBound(proc)); - } - } - return max_mem; - } + [[nodiscard]] v_memw_t memoryBound(const unsigned proc) const { return architecture.memoryBound(proc); } /** * @brief Sets the communication costs of the BSP architecture. - * * @param cost The communication costs to set. */ - inline void setCommunicationCosts(const v_commw_t cost) { architecture.setCommunicationCosts(cost); } + void setCommunicationCosts(const v_commw_t cost) { architecture.setCommunicationCosts(cost); } /** * @brief Sets the synchronisation costs of the BSP architecture. - * * @param cost The synchronisation costs to set. */ - inline void setSynchronisationCosts(const v_commw_t cost) { architecture.setSynchronisationCosts(cost); } + void setSynchronisationCosts(const v_commw_t cost) { architecture.setSynchronisationCosts(cost); } + + /** + * @brief Sets the number of processors. Processor type is set to 0 for all processors. 
+ * Resets send costs to uniform (1) and diagonal to 0. The memory bound is set to 100 for all processors. + * @param num The number of processors. Must be greater than 0. + * @throws std::invalid_argument if the number of processors is 0. + */ + void setNumberOfProcessors(const unsigned num) { architecture.setNumberOfProcessors(num); } + + /** + * @brief Returns the processor type for a given processor index. Does not perform bounds checking. + * @param proc The processor index. + */ + [[nodiscard]] vertex_type_t_or_default processorType(const unsigned proc) const { return architecture.processorType(proc); } /** - * @brief Sets the number of processors in the BSP architecture. + * @brief Checks if a node is compatible with a processor. Does not perform bounds checking. * - * @param num The number of processors to set. + * @param node The node index. + * @param processor_id The processor index. + * @return True if the node is compatible with the processor, false otherwise. */ - inline void setNumberOfProcessors(const unsigned num) { architecture.setNumberOfProcessors(num); } + [[nodiscard]] bool isCompatible(const vertex_idx_t &node, const unsigned processor_id) const { + return isCompatibleType(cdag.vertex_type(node), architecture.processorType(processor_id)); + } - bool check_memory_constraints_feasibility() const { + /** + * @brief Checks if a node type is compatible with a processor type. Does not perform bounds checking. + * + * @param nodeType The node type. + * @param processorType The processor type. + * @return True if the node type is compatible with the processor type, false otherwise. 
+ */ + [[nodiscard]] bool isCompatibleType(const vertex_type_t_or_default nodeType, const processor_type_t processorType) const { + return nodeProcessorCompatibility[nodeType][processorType]; + } - std::vector> max_memory_per_proc_type(architecture.getNumberOfProcessorTypes(), 0); - for (unsigned proc = 0; proc < architecture.numberOfProcessors(); proc++) { - max_memory_per_proc_type[architecture.processorType(proc)] = - std::max(max_memory_per_proc_type[architecture.processorType(proc)], architecture.memoryBound(proc)); - } - for (unsigned vertType = 0; vertType < cdag.num_vertex_types(); vertType++) { - v_memw_t max_memory_of_type = max_memory_weight(vertType, cdag); - bool fits = false; + /** + * @brief Sets the node-processor compatibility matrix. The matrix is copied. Dimensions are not checked. + * @param compatibility_ The compatibility matrix. + */ + void setNodeProcessorCompatibility(const std::vector> &compatibility_) { + nodeProcessorCompatibility = compatibility_; + } - for (unsigned proc_type = 0; proc_type < architecture.getNumberOfProcessorTypes(); proc_type++) { - if (isCompatibleType(vertType, proc_type)) { - fits = fits | (max_memory_of_type <= max_memory_per_proc_type[proc_type]); - if (fits) - break; - } - } + /** + * @brief Returns the node-processor compatibility matrix. + */ + [[nodiscard]] const std::vector> &getNodeProcessorCompatibilityMatrix() const { + return nodeProcessorCompatibility; + } - if (!fits) - return false; - } + /** + * @brief Returns the node type - processor type compatibility matrix. + */ + [[nodiscard]] const std::vector> &getProcessorCompatibilityMatrix() const { return nodeProcessorCompatibility; } - return true; + /** + * @brief Sets the compatibility matrix to be diagonal. This implies that node type `i` is only compatible with processor type `i`. + * @param number_of_types The number of types. 
+ */ + void setDiagonalCompatibilityMatrix(const vertex_type_t_or_default number_of_types) { + nodeProcessorCompatibility.assign(number_of_types, std::vector(number_of_types, false)); + for (vertex_type_t_or_default i = 0; i < number_of_types; ++i) + nodeProcessorCompatibility[i][i] = true; } - void adjust_memory_constraints() { + /** + * @brief Sets the compatibility matrix to all ones. This implies that all node types are compatible with all processor types. + */ + void setAllOnesCompatibilityMatrix() { + nodeProcessorCompatibility.assign(cdag.num_vertex_types(), std::vector(architecture.getNumberOfProcessorTypes(), true)); + } + /** + * @brief Returns false if there is a node whose weight does not fit on any of its compatible processors. + * @return True if the memory constraints are feasible, false otherwise. + */ + [[nodiscard]] bool CheckMemoryConstraintsFeasibility() const { std::vector> max_memory_per_proc_type(architecture.getNumberOfProcessorTypes(), 0); - for (unsigned proc = 0; proc < architecture.numberOfProcessors(); proc++) { + for (unsigned proc = 0U; proc < architecture.numberOfProcessors(); proc++) { max_memory_per_proc_type[architecture.processorType(proc)] = std::max(max_memory_per_proc_type[architecture.processorType(proc)], architecture.memoryBound(proc)); } - for (unsigned vertType = 0; vertType < cdag.num_vertex_types(); vertType++) { + + for (vertex_type_t_or_default vertType = 0U; vertType < cdag.num_vertex_types(); vertType++) { v_memw_t max_memory_of_type = max_memory_weight(vertType, cdag); bool fits = false; - for (unsigned proc_type = 0; proc_type < architecture.getNumberOfProcessorTypes(); proc_type++) { + for (processor_type_t proc_type = 0U; proc_type < architecture.getNumberOfProcessorTypes(); proc_type++) { if (isCompatibleType(vertType, proc_type)) { fits = fits | (max_memory_of_type <= max_memory_per_proc_type[proc_type]); if (fits) @@ -302,140 +330,29 @@ class BspInstance { } } - if (!fits) { - std::cout << "Warning: 
Computational DAG memory weight exceeds architecture memory bound." << std::endl; - std::cout << "VertexType " << vertType << " has memory " - << " and exceeds compatible processor types memory limit." << std::endl; - - for (unsigned proc = 0; proc < architecture.numberOfProcessors(); proc++) { - if (isCompatibleType(vertType, architecture.processorType(proc))) { - std::cout << "Increasing memory of processor " << proc << " of type " - << architecture.processorType(proc) << " to " << max_memory_of_type << "." - << std::endl; - architecture.setMemoryBound(max_memory_of_type, proc); - } - } - } + if (!fits) + return false; } - } - - inline v_type_t processorType(unsigned p1) const { return architecture.processorType(p1); } - - inline bool isCompatible(const vertex_idx_t &node, unsigned processor_id) const { - return isCompatibleType(cdag.vertex_type(node), architecture.processorType(processor_id)); - } - - inline bool isCompatibleType(v_type_t nodeType, v_type_t processorType) const { - - return nodeProcessorCompatibility[nodeType][processorType]; - } - - void setNodeProcessorCompatibility(const std::vector> &compatibility_) { - - nodeProcessorCompatibility = compatibility_; - } - - const std::vector> &getProcessorCompatibilityMatrix() const { return nodeProcessorCompatibility; } - - void setDiagonalCompatibilityMatrix(unsigned number_of_types) { - nodeProcessorCompatibility = - std::vector>(number_of_types, std::vector(number_of_types, false)); - for (unsigned i = 0; i < number_of_types; ++i) - nodeProcessorCompatibility[i][i] = true; - } - - void setAllOnesCompatibilityMatrix() { - - unsigned number_of_node_types = cdag.num_vertex_types(); - unsigned number_of_proc_types = architecture.getNumberOfProcessorTypes(); - - nodeProcessorCompatibility = - std::vector>(number_of_node_types, std::vector(number_of_proc_types, true)); + return true; } - std::vector> getProcTypesCompatibleWithNodeType() const { - unsigned numberOfNodeTypes = cdag.num_vertex_types(); - unsigned 
numberOfProcTypes = architecture.getNumberOfProcessorTypes(); - std::vector> compatibleProcTypes(numberOfNodeTypes); + /** + * @brief Returns a list of compatible processor types for each node type. + * @return A vector where the index is the node type and the value is a vector of compatible processor types. + */ + [[nodiscard]] std::vector> getProcTypesCompatibleWithNodeType() const { + vertex_type_t_or_default numberOfNodeTypes = cdag.num_vertex_types(); + processor_type_t numberOfProcTypes = architecture.getNumberOfProcessorTypes(); + std::vector> compatibleProcTypes(numberOfNodeTypes); - for (unsigned nodeType = 0; nodeType < numberOfNodeTypes; ++nodeType) - for (unsigned processorType = 0; processorType < numberOfProcTypes; ++processorType) + for (vertex_type_t_or_default nodeType = 0U; nodeType < numberOfNodeTypes; ++nodeType) + for (processor_type_t processorType = 0U; processorType < numberOfProcTypes; ++processorType) if (isCompatibleType(nodeType, processorType)) compatibleProcTypes[nodeType].push_back(processorType); return compatibleProcTypes; } - - std::vector> getNodeNodeCompatabilityMatrix() const { - std::vector> compMat(cdag.num_vertex_types(), - std::vector(cdag.num_vertex_types(), false)); - for (unsigned nodeType1 = 0; nodeType1 < cdag.num_vertex_types(); nodeType1++) { - for (unsigned nodeType2 = 0; nodeType2 < cdag.num_vertex_types(); nodeType2++) { - for (unsigned procType = 0; procType < architecture.getNumberOfProcessorTypes(); procType++) { - if (isCompatibleType(nodeType1, procType) && isCompatibleType(nodeType2, procType)) { - compMat[nodeType1][nodeType2] = true; - break; - } - } - } - } - return compMat; - } - - inline const std::vector> &getNodeProcessorCompatibilityMatrix() const { - return nodeProcessorCompatibility; - } -}; - -template -class compatible_processor_range { - - std::vector> type_processor_idx; - const BspInstance *instance = nullptr; - - public: - - compatible_processor_range() = default; - - 
compatible_processor_range(const BspInstance &inst) { - initialize(inst); - } - - inline void initialize(const BspInstance &inst) { - - instance = &inst; - - if constexpr (has_typed_vertices_v) { - - type_processor_idx = std::vector>(inst.getComputationalDag().num_vertex_types()); - - for (v_type_t v_type = 0; v_type < inst.getComputationalDag().num_vertex_types(); v_type++) { - for (unsigned proc = 0; proc < inst.numberOfProcessors(); proc++) - if (inst.isCompatibleType(v_type, inst.processorType(proc))) - type_processor_idx[v_type].push_back(proc); - - } - } - } - - inline const auto & compatible_processors_type(v_type_t type) const { - - assert(instance != nullptr); - - if constexpr (has_typed_vertices_v) { - return type_processor_idx[type]; - } else { - return instance->processors(); - } - } - - inline const auto & compatible_processors_vertex(vertex_idx_t vertex) const { - return compatible_processors_type(instance->getComputationalDag().vertex_type(vertex)); - } - - }; - } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/model/BspSchedule.hpp b/include/osp/bsp/model/BspSchedule.hpp index eeeaeec3..9e5a5d52 100644 --- a/include/osp/bsp/model/BspSchedule.hpp +++ b/include/osp/bsp/model/BspSchedule.hpp @@ -25,8 +25,8 @@ limitations under the License. 
#include "IBspSchedule.hpp" #include "IBspScheduleEval.hpp" -#include "SetSchedule.hpp" #include "osp/bsp/model/cost/LazyCommunicationCost.hpp" +#include "osp/bsp/model/util/SetSchedule.hpp" #include "osp/concepts/computational_dag_concept.hpp" namespace osp { @@ -105,9 +105,7 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval, public IBspScheduleEval, public IBspScheduleEval, public IBspScheduleEvalnumberOfVertices()) { node_to_superstep_assignment[node] = superstep; @@ -275,7 +273,7 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval, public IBspScheduleEval, public IBspScheduleEval> getAssignedNodeVector(unsigned int processor) const { + [[nodiscard]] std::vector> getAssignedNodeVector(const unsigned processor) const { std::vector> vec; for (const auto &node : instance->vertices()) { @@ -498,7 +496,7 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval> getAssignedNodeVector(unsigned int processor, unsigned int superstep) const { + [[nodiscard]] std::vector> getAssignedNodeVector(const unsigned processor, const unsigned superstep) const { std::vector> vec; for (const auto &node : instance->vertices()) { @@ -515,7 +513,7 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval, public IBspScheduleEvalvertices()) { @@ -572,11 +570,14 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval comm_phase_empty(number_of_supersteps, true); - for (const auto &node : instance->vertices()) - for (const auto &child : instance->getComputationalDag().children(node)) - if (node_to_processor_assignment[node] != node_to_processor_assignment[child]) + for (const auto &node : instance->vertices()) { + for (const auto &child : instance->getComputationalDag().children(node)) { + if (node_to_processor_assignment[node] != node_to_processor_assignment[child]) { for (unsigned offset = 1; offset <= getStaleness(); ++offset) comm_phase_empty[node_to_superstep_assignment[child] - offset] = false; + } + } + } std::vector 
new_step_index(number_of_supersteps); unsigned current_index = 0; @@ -585,9 +586,9 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEvalvertices()) + for (const auto &node : instance->vertices()) { node_to_superstep_assignment[node] = new_step_index[node_to_superstep_assignment[node]]; - + } setNumberOfSupersteps(current_index); } @@ -633,7 +634,6 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval> current_proc_transient_memory(instance->numberOfProcessors(), 0); for (const auto &node : instance->vertices()) { - const unsigned proc = node_to_processor_assignment[node]; current_proc_persistent_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node); current_proc_transient_memory[proc] = std::max( @@ -659,7 +659,6 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEval> current_proc_memory(instance->numberOfProcessors(), 0); for (const auto &node : instance->vertices()) { - const unsigned proc = node_to_processor_assignment[node]; current_proc_memory[proc] += instance->getComputationalDag().vertex_mem_weight(node); @@ -671,12 +670,10 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEvalnumberOfProcessors(); proc++) { - v_memw_t memory = 0; for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { memory += instance->getComputationalDag().vertex_mem_weight(node) + @@ -701,12 +698,10 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEvalnumberOfProcessors(); proc++) { - std::unordered_set> nodes_with_incoming_edges; v_memw_t memory = 0; @@ -714,7 +709,6 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEvalgetComputationalDag().vertex_comm_weight(node); for (const auto &parent : instance->getComputationalDag().parents(node)) { - if (node_to_superstep_assignment[parent] != step) { nodes_with_incoming_edges.insert(parent); } @@ -734,23 +728,19 @@ class BspSchedule : public IBspSchedule, public IBspScheduleEvalnumberOfProcessors(); proc++) { - 
std::unordered_set> nodes_with_incoming_edges; v_memw_t memory = 0; for (const auto &node : set_schedule.step_processor_vertices[step][proc]) { - if (is_source(node, instance->getComputationalDag())) { memory += instance->getComputationalDag().vertex_mem_weight(node); } for (const auto &parent : instance->getComputationalDag().parents(node)) { - if (node_to_superstep_assignment[parent] != step) { nodes_with_incoming_edges.insert(parent); } diff --git a/include/osp/bsp/model/util/CompatibleProcessorRange.hpp b/include/osp/bsp/model/util/CompatibleProcessorRange.hpp new file mode 100644 index 00000000..c4d8df30 --- /dev/null +++ b/include/osp/bsp/model/util/CompatibleProcessorRange.hpp @@ -0,0 +1,101 @@ +/* +Copyright 2024 Huawei Technologies Co., Ltd. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +*/ + +#pragma once + +#include "osp/bsp/model/BspInstance.hpp" +#include + +namespace osp { + +/** + * @class CompatibleProcessorRange + * @brief Helper class to efficiently iterate over compatible processors for a given node or node type. + * + * This class precomputes and stores the list of compatible processors for each node type. + * + * @tparam Graph_t The type of the computational DAG. + */ +template +class CompatibleProcessorRange { + + std::vector> typeProcessorIdx; + const BspInstance *instance = nullptr; + + public: + /** + * @brief Default constructor. 
+ */ + CompatibleProcessorRange() = default; + + /** + * @brief Constructs a CompatibleProcessorRange for the given BspInstance. + * + * @param inst The BspInstance. + */ + CompatibleProcessorRange(const BspInstance &inst) { + initialize(inst); + } + + /** + * @brief Initializes the CompatibleProcessorRange with a BspInstance. + * + * @param inst The BspInstance. + */ + void initialize(const BspInstance &inst) { + instance = &inst; + + if constexpr (has_typed_vertices_v) { + typeProcessorIdx.resize(inst.getComputationalDag().num_vertex_types()); + + for (v_type_t v_type = 0; v_type < inst.getComputationalDag().num_vertex_types(); v_type++) { + for (unsigned proc = 0; proc < inst.numberOfProcessors(); proc++) + if (inst.isCompatibleType(v_type, inst.processorType(proc))) + typeProcessorIdx[v_type].push_back(proc); + } + } + } + + /** + * @brief Returns a range of compatible processors for a given node type. + * + * @param type The node type. + * @return A const reference to a vector of compatible processor indices. + */ + [[nodiscard]] const auto &compatible_processors_type(const v_type_t type) const { + assert(instance != nullptr); + if constexpr (has_typed_vertices_v) { + return typeProcessorIdx[type]; + } else { + return instance->processors(); + } + } + + /** + * @brief Returns a range of compatible processors for a given vertex. + * + * @param vertex The vertex index. + * @return A const reference to a vector of compatible processor indices. 
+ */ + [[nodiscard]] const auto &compatible_processors_vertex(const vertex_idx_t vertex) const { + assert(instance != nullptr); + return compatible_processors_type(instance->getComputationalDag().vertex_type(vertex)); + } +}; + +} // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/model/SetSchedule.hpp b/include/osp/bsp/model/util/SetSchedule.hpp similarity index 99% rename from include/osp/bsp/model/SetSchedule.hpp rename to include/osp/bsp/model/util/SetSchedule.hpp index da851f98..61946fae 100644 --- a/include/osp/bsp/model/SetSchedule.hpp +++ b/include/osp/bsp/model/util/SetSchedule.hpp @@ -18,7 +18,7 @@ limitations under the License. #pragma once -#include "IBspSchedule.hpp" +#include "osp/bsp/model/IBspSchedule.hpp" #include "osp/concepts/computational_dag_concept.hpp" namespace osp { diff --git a/include/osp/bsp/model/VectorSchedule.hpp b/include/osp/bsp/model/util/VectorSchedule.hpp similarity index 99% rename from include/osp/bsp/model/VectorSchedule.hpp rename to include/osp/bsp/model/util/VectorSchedule.hpp index a81cc3e5..ea856c1b 100644 --- a/include/osp/bsp/model/VectorSchedule.hpp +++ b/include/osp/bsp/model/util/VectorSchedule.hpp @@ -18,7 +18,7 @@ limitations under the License. 
#pragma once -#include "IBspSchedule.hpp" +#include "osp/bsp/model/IBspSchedule.hpp" #include "osp/concepts/computational_dag_concept.hpp" #include diff --git a/include/osp/bsp/scheduler/CoarseAndSchedule.hpp b/include/osp/bsp/scheduler/CoarseAndSchedule.hpp index 0e9df967..2e23c22e 100644 --- a/include/osp/bsp/scheduler/CoarseAndSchedule.hpp +++ b/include/osp/bsp/scheduler/CoarseAndSchedule.hpp @@ -42,17 +42,17 @@ class CoarseAndSchedule : public Scheduler { const auto &instance = schedule.getInstance(); BspInstance instance_coarse; - + std::vector> reverse_vertex_map; bool status = coarser.coarsenDag(instance.getComputationalDag(), instance_coarse.getComputationalDag(), - reverse_vertex_map); + reverse_vertex_map); if (!status) { return RETURN_STATUS::ERROR; - } + } - instance_coarse.setArchitecture(instance.getArchitecture()); + instance_coarse.getArchitecture() = instance.getArchitecture(); instance_coarse.setNodeProcessorCompatibility(instance.getProcessorCompatibilityMatrix()); BspSchedule schedule_coarse(instance_coarse); diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp index 38fae9ff..b5b4ea95 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp @@ -39,6 +39,8 @@ limitations under the License. namespace osp { +static constexpr unsigned CacheLineSize = 64; + template struct GrowLocalAutoCoresParallel_Params { vert_t minSuperstepSize = 20; diff --git a/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp b/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp index aa199c45..45b58ca3 100644 --- a/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp +++ b/include/osp/bsp/scheduler/IlpSchedulers/CoptFullScheduler.hpp @@ -21,14 +21,14 @@ limitations under the License. 
#include #include +#include "osp/auxiliary/io/DotFileWriter.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/model/BspScheduleCS.hpp" #include "osp/bsp/model/BspScheduleRecomp.hpp" #include "osp/bsp/model/MaxBspSchedule.hpp" #include "osp/bsp/model/MaxBspScheduleCS.hpp" -#include "osp/bsp/model/VectorSchedule.hpp" +#include "osp/bsp/model/util/VectorSchedule.hpp" #include "osp/bsp/scheduler/Scheduler.hpp" -#include "osp/auxiliary/io/DotFileWriter.hpp" namespace osp { @@ -111,17 +111,19 @@ class CoptFullScheduler : public Scheduler { if (allow_recomputation_cb) { - auto sched = constructBspScheduleRecompFromCallback(); - DotFileWriter sched_writer; - sched_writer.write_schedule_recomp(write_solutions_path_cb + "intmed_sol_" + solution_file_prefix_cb + "_" + - std::to_string(counter) + "_schedule.dot", sched); + auto sched = constructBspScheduleRecompFromCallback(); + DotFileWriter sched_writer; + sched_writer.write_schedule_recomp(write_solutions_path_cb + "intmed_sol_" + solution_file_prefix_cb + "_" + + std::to_string(counter) + "_schedule.dot", + sched); } else { - BspSchedule sched = constructBspScheduleFromCallback(); - DotFileWriter sched_writer; - sched_writer.write_schedule(write_solutions_path_cb + "intmed_sol_" + solution_file_prefix_cb + "_" + - std::to_string(counter) + "_schedule.dot", sched); + BspSchedule sched = constructBspScheduleFromCallback(); + DotFileWriter sched_writer; + sched_writer.write_schedule(write_solutions_path_cb + "intmed_sol_" + solution_file_prefix_cb + "_" + + std::to_string(counter) + "_schedule.dot", + sched); } counter++; } @@ -259,7 +261,7 @@ class CoptFullScheduler : public Scheduler { } } - if(is_max_bsp && number_of_supersteps>0) // can ignore last 2 comm phases in this case + if (is_max_bsp && number_of_supersteps > 0) // can ignore last 2 comm phases in this case --number_of_supersteps; schedule.getCommunicationSchedule().clear(); @@ -268,7 +270,7 @@ class CoptFullScheduler : public Scheduler { for 
(unsigned int p_from = 0; p_from < instance.numberOfProcessors(); p_from++) { for (unsigned int p_to = 0; p_to < instance.numberOfProcessors(); p_to++) { if (p_from != p_to) { - for (unsigned int step = 0; step < number_of_supersteps-1; step++) { + for (unsigned int step = 0; step < number_of_supersteps - 1; step++) { if (comm_processor_to_processor_superstep_node_var[p_from][p_to][step] [static_cast(node)] .Get(COPT_DBLINFO_VALUE) >= .99) { @@ -302,7 +304,7 @@ class CoptFullScheduler : public Scheduler { for (unsigned processor = 0; processor < schedule.getInstance().numberOfProcessors(); processor++) { - for (unsigned step = 0; step < number_of_supersteps-1; step++) { + for (unsigned step = 0; step < number_of_supersteps - 1; step++) { if (node_to_processor_superstep_var[node][processor][static_cast(step)].Get(COPT_DBLINFO_VALUE) >= .99) { schedule.assignments(node).emplace_back(processor, step); @@ -334,46 +336,35 @@ class CoptFullScheduler : public Scheduler { } } - void loadInitialSchedule(Model &model, const BspInstance &instance) { if (use_initial_schedule_recomp && (max_number_supersteps < initial_schedule_recomp->numberOfSupersteps() || - instance.numberOfProcessors() != initial_schedule_recomp->getInstance().numberOfProcessors() || - instance.numberOfVertices() != initial_schedule_recomp->getInstance().numberOfVertices())) { + instance.numberOfProcessors() != initial_schedule_recomp->getInstance().numberOfProcessors() || + instance.numberOfVertices() != initial_schedule_recomp->getInstance().numberOfVertices())) { throw std::invalid_argument("Invalid Argument while computeScheduleRecomp[Recomp]: instance parameters do not " "agree with those of the initial schedule's instance!"); } if (!use_initial_schedule_recomp & use_initial_schedule && (max_number_supersteps < initial_schedule->numberOfSupersteps() || - instance.numberOfProcessors() != initial_schedule->getInstance().numberOfProcessors() || - instance.numberOfVertices() != 
initial_schedule->getInstance().numberOfVertices())) { + instance.numberOfProcessors() != initial_schedule->getInstance().numberOfProcessors() || + instance.numberOfVertices() != initial_schedule->getInstance().numberOfVertices())) { throw std::invalid_argument("Invalid Argument while computeScheduleRecomp[Recomp]: instance parameters do not " "agree with those of the initial schedule's instance!"); } - const auto& DAG = use_initial_schedule_recomp ? - initial_schedule_recomp->getInstance().getComputationalDag() : - initial_schedule->getInstance().getComputationalDag(); + const auto &DAG = use_initial_schedule_recomp ? initial_schedule_recomp->getInstance().getComputationalDag() : initial_schedule->getInstance().getComputationalDag(); - const auto& arch = use_initial_schedule_recomp ? - initial_schedule_recomp->getInstance().getArchitecture() : - initial_schedule->getInstance().getArchitecture(); + const auto &arch = use_initial_schedule_recomp ? initial_schedule_recomp->getInstance().getArchitecture() : initial_schedule->getInstance().getArchitecture(); - const unsigned& num_processors = use_initial_schedule_recomp ? - initial_schedule_recomp->getInstance().numberOfProcessors() : - initial_schedule->getInstance().numberOfProcessors(); + const unsigned &num_processors = use_initial_schedule_recomp ? initial_schedule_recomp->getInstance().numberOfProcessors() : initial_schedule->getInstance().numberOfProcessors(); - const unsigned& num_supersteps = use_initial_schedule_recomp ? - initial_schedule_recomp->numberOfSupersteps() : - initial_schedule->numberOfSupersteps(); + const unsigned &num_supersteps = use_initial_schedule_recomp ? initial_schedule_recomp->numberOfSupersteps() : initial_schedule->numberOfSupersteps(); - const auto &cs = use_initial_schedule_recomp ? - initial_schedule_recomp->getCommunicationSchedule() : - initial_schedule->getCommunicationSchedule(); + const auto &cs = use_initial_schedule_recomp ? 
initial_schedule_recomp->getCommunicationSchedule() : initial_schedule->getCommunicationSchedule(); - assert(max_number_supersteps <= static_cast( std::numeric_limits::max()) ); + assert(max_number_supersteps <= static_cast(std::numeric_limits::max())); for (unsigned step = 0; step < max_number_supersteps; step++) { if (step < num_supersteps) { @@ -387,28 +378,23 @@ class CoptFullScheduler : public Scheduler { // model.SetMipStart(max_comm_superstep_var[step], COPT_INFINITY); } - std::vector > > computed(DAG.num_vertices()); - for (const auto &node : DAG.vertices()) - { - if(use_initial_schedule_recomp) - for (const std::pair& assignment : initial_schedule_recomp->assignments(node)) + std::vector>> computed(DAG.num_vertices()); + for (const auto &node : DAG.vertices()) { + if (use_initial_schedule_recomp) + for (const std::pair &assignment : initial_schedule_recomp->assignments(node)) computed[node].emplace(assignment); else - computed[node].emplace(initial_schedule->assignedProcessor(node),initial_schedule->assignedSuperstep(node)); + computed[node].emplace(initial_schedule->assignedProcessor(node), initial_schedule->assignedSuperstep(node)); } - std::vector > first_at(DAG.num_vertices(), std::vector(num_processors, std::numeric_limits::max())); - for (const auto &node : DAG.vertices()) - { - if(use_initial_schedule_recomp) - { - for (const std::pair& assignment : initial_schedule_recomp->assignments(node)) + std::vector> first_at(DAG.num_vertices(), std::vector(num_processors, std::numeric_limits::max())); + for (const auto &node : DAG.vertices()) { + if (use_initial_schedule_recomp) { + for (const std::pair &assignment : initial_schedule_recomp->assignments(node)) first_at[node][assignment.first] = std::min(first_at[node][assignment.first], assignment.second); - } - else - { + } else { first_at[node][initial_schedule->assignedProcessor(node)] = std::min(first_at[node][initial_schedule->assignedProcessor(node)], - initial_schedule->assignedSuperstep(node) ); + 
initial_schedule->assignedSuperstep(node)); } } @@ -431,7 +417,7 @@ class CoptFullScheduler : public Scheduler { comm_processor_to_processor_superstep_node_var[p1][p2][step] [static_cast(node)], 1); - first_at[node][p2] = std::min(first_at[node][p2], step+staleness); + first_at[node][p2] = std::min(first_at[node][p2], step + staleness); } else { model.SetMipStart( comm_processor_to_processor_superstep_node_var[p1][p2][step] @@ -447,14 +433,15 @@ class CoptFullScheduler : public Scheduler { for (const auto &node : DAG.vertices()) for (unsigned proc = 0; proc < num_processors; proc++) - for(unsigned step = 0; step < max_number_supersteps; step++) - { - if(step >= first_at[node][proc]) + for (unsigned step = 0; step < max_number_supersteps; step++) { + if (step >= first_at[node][proc]) model.SetMipStart(comm_processor_to_processor_superstep_node_var[proc][proc][step] - [static_cast(node)], 1); + [static_cast(node)], + 1); else model.SetMipStart(comm_processor_to_processor_superstep_node_var[proc][proc][step] - [static_cast(node)], 0); + [static_cast(node)], + 0); } for (const auto &node : DAG.vertices()) { @@ -478,16 +465,13 @@ class CoptFullScheduler : public Scheduler { max_number_supersteps, std::vector>(num_processors, 0)); - if(use_initial_schedule_recomp) - { + if (use_initial_schedule_recomp) { for (const auto &node : initial_schedule_recomp->getInstance().vertices()) { - for (const std::pair& assignment : initial_schedule_recomp->assignments(node)) { + for (const std::pair &assignment : initial_schedule_recomp->assignments(node)) { work[assignment.second][assignment.first] += DAG.vertex_work_weight(node); } } - } - else - { + } else { for (const auto &node : initial_schedule->getInstance().vertices()) work[initial_schedule->assignedSuperstep(node)][initial_schedule->assignedProcessor(node)] += DAG.vertex_work_weight(node); @@ -544,15 +528,14 @@ class CoptFullScheduler : public Scheduler { Variables */ - assert(max_number_supersteps <= static_cast( 
std::numeric_limits::max() )); - assert(instance.numberOfProcessors() <= static_cast( std::numeric_limits::max()) ); + assert(max_number_supersteps <= static_cast(std::numeric_limits::max())); + assert(instance.numberOfProcessors() <= static_cast(std::numeric_limits::max())); // variables indicating if superstep is used at all superstep_used_var = model.AddVars(static_cast(max_number_supersteps), COPT_BINARY, "superstep_used"); VarArray superstep_has_comm, mergeable_superstep_penalty; - if(is_max_bsp) - { + if (is_max_bsp) { // variables indicating if there is any communication in superstep superstep_has_comm = model.AddVars(static_cast(max_number_supersteps), COPT_BINARY, "superstep_has_comm"); // variables that incentivize the schedule to be continuous - needs to be done differently for maxBsp @@ -676,13 +659,12 @@ class CoptFullScheduler : public Scheduler { if (step > 0) { for (unsigned int p_from = 0; p_from < instance.numberOfProcessors(); p_from++) { - if(!is_max_bsp || p_from == processor){ + if (!is_max_bsp || p_from == processor) { expr1 += comm_processor_to_processor_superstep_node_var[p_from][processor][step - 1] - [static_cast(node)]; - } - else if(step > 1){ + [static_cast(node)]; + } else if (step > 1) { expr1 += comm_processor_to_processor_superstep_node_var[p_from][processor][step - 2] - [static_cast(node)]; + [static_cast(node)]; } } } @@ -700,26 +682,25 @@ class CoptFullScheduler : public Scheduler { } // synchronization cost calculation & forcing continuous schedule in maxBsp - if(is_max_bsp) - { + if (is_max_bsp) { for (unsigned int step = 0; step < max_number_supersteps; step++) { Expr expr; for (const auto &node : instance.vertices()) { for (unsigned int p_from = 0; p_from < instance.numberOfProcessors(); p_from++) { for (unsigned int p_to = 0; p_to < instance.numberOfProcessors(); p_to++) { - if(p_from != p_to) + if (p_from != p_to) expr += comm_processor_to_processor_superstep_node_var[p_from][p_to][step][static_cast(node)]; } } } 
model.AddConstr(static_cast(instance.numberOfProcessors() * instance.numberOfProcessors() * instance.numberOfVertices()) * - superstep_has_comm[static_cast(step)] >= expr); + superstep_has_comm[static_cast(step)] >= + expr); } // if step i and (i+1) has no comm, and (i+2) has work, then (i+1) and (i+2) are mergeable -> penalize for (unsigned int step = 0; step < max_number_supersteps - 2; step++) - model.AddConstr(superstep_used_var[static_cast(step + 2)] - superstep_has_comm[static_cast(step)] - - superstep_has_comm[static_cast(step + 1)] <= mergeable_superstep_penalty[static_cast(step)]); + model.AddConstr(superstep_used_var[static_cast(step + 2)] - superstep_has_comm[static_cast(step)] - superstep_has_comm[static_cast(step + 1)] <= mergeable_superstep_penalty[static_cast(step)]); } max_comm_superstep_var = @@ -784,7 +765,7 @@ class CoptFullScheduler : public Scheduler { // vertex type restrictions for (const vertex_idx_t &node : instance.vertices()) { for (unsigned int processor = 0; processor < instance.numberOfProcessors(); processor++) { - if(!instance.isCompatible(node, processor)) { + if (!instance.isCompatible(node, processor)) { for (unsigned int step = 0; step < max_number_supersteps; step++) { model.AddConstr(node_to_processor_superstep_var[node][processor][static_cast(step)] == 0); } @@ -797,20 +778,17 @@ class CoptFullScheduler : public Scheduler { */ Expr expr; - if(is_max_bsp) - { + if (is_max_bsp) { VarArray max_superstep_var = model.AddVars(static_cast(max_number_supersteps), COPT_INTEGER, "max_superstep"); for (unsigned int step = 0; step < max_number_supersteps; step++) { model.AddConstr(max_superstep_var[static_cast(step)] >= max_work_superstep_var[static_cast(step)]); - if(step > 0) - model.AddConstr(max_superstep_var[static_cast(step)] >= instance.communicationCosts() * max_comm_superstep_var[static_cast(step-1)]); + if (step > 0) + model.AddConstr(max_superstep_var[static_cast(step)] >= instance.communicationCosts() * 
max_comm_superstep_var[static_cast(step - 1)]); expr += max_superstep_var[static_cast(step)]; expr += instance.synchronisationCosts() * superstep_has_comm[static_cast(step)]; expr += instance.synchronisationCosts() * mergeable_superstep_penalty[static_cast(step)]; } - } - else - { + } else { for (unsigned int step = 0; step < max_number_supersteps; step++) { expr += max_work_superstep_var[static_cast(step)] + instance.communicationCosts() * max_comm_superstep_var[static_cast(step)] + @@ -877,7 +855,7 @@ class CoptFullScheduler : public Scheduler { // solution_callback.node_to_processor_superstep_var_ptr = &node_to_processor_superstep_var; } - CoptFullScheduler(const BspScheduleRecomp &schedule) + CoptFullScheduler(const BspScheduleRecomp &schedule) : allow_recomputation(true), use_memory_constraint(false), use_initial_schedule_recomp(true), write_solutions_found(false), initial_schedule_recomp(&schedule), max_number_supersteps(schedule.numberOfSupersteps()) { @@ -931,7 +909,6 @@ class CoptFullScheduler : public Scheduler { return run_scheduler(schedule); } - virtual RETURN_STATUS computeScheduleCS(BspScheduleCS &schedule) override { allow_recomputation = false; is_max_bsp = false; @@ -1010,7 +987,6 @@ class CoptFullScheduler : public Scheduler { model.Solve(); } - /** * @brief Sets the provided schedule as the initial solution for the ILP. 
* diff --git a/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp b/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp index 5d759687..c051c8dc 100644 --- a/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp +++ b/include/osp/bsp/scheduler/IlpSchedulers/TotalCommunicationScheduler.hpp @@ -240,7 +240,7 @@ class TotalCommunicationScheduler : public Scheduler { SetSolution((*max_work_superstep_var_ptr)[static_cast(step)], max_work); } - if (instance_ptr->isNumaInstance()) { + if (instance_ptr->getArchitecture().isNumaArchitecture()) { for (unsigned p1 = 0; p1 < instance_ptr->numberOfProcessors(); p1++) { for (unsigned p2 = 0; p2 < instance_ptr->numberOfProcessors(); p2++) { @@ -670,7 +670,6 @@ class TotalCommunicationScheduler : public Scheduler { loadInitialSchedule(); } - model.SetIntParam(COPT_INTPARAM_THREADS, 128); model.SetIntParam(COPT_INTPARAM_STRONGBRANCHING, 1); model.SetIntParam(COPT_INTPARAM_LPMETHOD, 1); diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin/kl_current_schedule.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin/kl_current_schedule.hpp index af5bfd19..1c544fd1 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin/kl_current_schedule.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin/kl_current_schedule.hpp @@ -16,12 +16,12 @@ limitations under the License. @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner */ -//#define KL_DEBUG +// #define KL_DEBUG #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/model/IBspSchedule.hpp" -#include "osp/bsp/model/SetSchedule.hpp" -#include "osp/bsp/model/VectorSchedule.hpp" +#include "osp/bsp/model/util/SetSchedule.hpp" +#include "osp/bsp/model/util/VectorSchedule.hpp" #include "osp/bsp/scheduler/ImprovementScheduler.hpp" #include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" #include "osp/graph_algorithms/directed_graph_util.hpp" @@ -73,17 +73,15 @@ class kl_current_schedule { using EdgeType = edge_desc_t; public: - kl_current_schedule(Ikl_cost_function *cost_f_) : cost_f(cost_f_) { -#ifdef KL_DEBUG +#ifdef KL_DEBUG if constexpr (use_memory_constraint) { std::cout << "KLCurrentSchedule constructor with memory constraint" << std::endl; } else { std::cout << "KLCurrentSchedule constructor without memory constraint" << std::endl; } #endif - } virtual ~kl_current_schedule() = default; @@ -358,7 +356,7 @@ class kl_current_schedule { if constexpr (use_memory_constraint) { memory_constraint.apply_move(move.node, move.from_proc, move.from_step, move.to_proc, move.to_step); - } + } } virtual void initialize_current_schedule(const IBspSchedule &schedule) { diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp index f6c425bd..2cf0c631 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp @@ -97,7 +97,7 @@ struct kl_bsp_comm_cost_function { constexpr static bool is_max_comm_cost_function = true; kl_active_schedule *active_schedule; - compatible_processor_range *proc_range; + CompatibleProcessorRange *proc_range; const Graph_t *graph; const BspInstance *instance; @@ -119,7 +119,7 @@ struct 
kl_bsp_comm_cost_function { } void initialize(kl_active_schedule &sched, - compatible_processor_range &p_range) { + CompatibleProcessorRange &p_range) { active_schedule = &sched; proc_range = &p_range; instance = &sched.getInstance(); diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp index 50384c72..caaad9ca 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp @@ -24,24 +24,24 @@ limitations under the License. namespace osp { -template +template struct kl_hyper_total_comm_cost_function { - + using VertexType = vertex_idx_t; using kl_move = kl_move_struct; using kl_gain_update_info = kl_update_info; - + constexpr static unsigned window_range = 2 * window_size + 1; constexpr static bool is_max_comm_cost_function = false; kl_active_schedule *active_schedule; - compatible_processor_range *proc_range; + CompatibleProcessorRange *proc_range; const Graph_t *graph; const BspInstance *instance; - cost_t comm_multiplier = 1; + cost_t comm_multiplier = 1; cost_t max_comm_weight = 0; lambda_vector_container node_lambda_map; @@ -52,20 +52,20 @@ struct kl_hyper_total_comm_cost_function { const std::string name() const { return "toal_comm_cost"; } inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); } - void initialize(kl_active_schedule &sched, compatible_processor_range &p_range) { + void initialize(kl_active_schedule &sched, CompatibleProcessorRange &p_range) { active_schedule = &sched; proc_range = &p_range; instance = &sched.getInstance(); graph = &instance->getComputationalDag(); - comm_multiplier = 1.0 / instance->numberOfProcessors(); - 
node_lambda_map.initialize(graph->num_vertices(), instance->numberOfProcessors()); + comm_multiplier = 1.0 / instance->numberOfProcessors(); + node_lambda_map.initialize(graph->num_vertices(), instance->numberOfProcessors()); } struct empty_struct {}; using pre_move_comm_data_t = empty_struct; - inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); } + inline empty_struct get_pre_move_comm_data(const kl_move &) { return empty_struct(); } cost_t compute_schedule_cost() { cost_t work_costs = 0; @@ -74,7 +74,7 @@ struct kl_hyper_total_comm_cost_function { } cost_t comm_costs = 0; - for(const auto vertex : graph->vertices()) { + for (const auto vertex : graph->vertices()) { const unsigned vertex_proc = active_schedule->assigned_processor(vertex); const cost_t v_comm_cost = graph->vertex_comm_weight(vertex); max_comm_weight = std::max(max_comm_weight, v_comm_cost); @@ -87,7 +87,7 @@ struct kl_hyper_total_comm_cost_function { if (node_lambda_map.increase_proc_count(vertex, target_proc)) { comm_costs += v_comm_cost * instance->communicationCosts(vertex_proc, target_proc); // is 0 if target_proc == vertex_proc } - } + } } return work_costs + comm_costs * comm_multiplier + static_cast>(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); @@ -100,132 +100,132 @@ struct kl_hyper_total_comm_cost_function { } cost_t comm_costs = 0; - for(const auto vertex : graph->vertices()) { + for (const auto vertex : graph->vertices()) { const unsigned vertex_proc = active_schedule->assigned_processor(vertex); const cost_t v_comm_cost = graph->vertex_comm_weight(vertex); for (const auto lambdaproc_mult_pair : node_lambda_map.iterate_proc_entries(vertex)) { const auto &lambda_proc = lambdaproc_mult_pair.first; comm_costs += v_comm_cost * instance->communicationCosts(vertex_proc, lambda_proc); - } + } } return work_costs + comm_costs * comm_multiplier + static_cast>(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); } - inline 
void update_datastructure_after_move(const kl_move & move, const unsigned start_step, const unsigned end_step) { - if (move.to_proc != move.from_proc) { + inline void update_datastructure_after_move(const kl_move &move, const unsigned start_step, const unsigned end_step) { + if (move.to_proc != move.from_proc) { for (const auto &source : instance->getComputationalDag().parents(move.node)) { const unsigned source_step = active_schedule->assigned_superstep(source); if (source_step < start_step || source_step > end_step) continue; - update_source_after_move(move, source); + update_source_after_move(move, source); } } } - inline void update_source_after_move(const kl_move & move, VertexType source) { + inline void update_source_after_move(const kl_move &move, VertexType source) { node_lambda_map.decrease_proc_count(source, move.from_proc); node_lambda_map.increase_proc_count(source, move.to_proc); } template - void update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map & max_gain_recompute, std::vector &new_nodes) { - + void update_node_comm_affinity(const kl_move &move, thread_data_t &thread_data, const cost_t &penalty, const cost_t &reward, std::map &max_gain_recompute, std::vector &new_nodes) { + const unsigned start_step = thread_data.start_step; const unsigned end_step = thread_data.end_step; - + for (const auto &target : instance->getComputationalDag().children(move.node)) { - const unsigned target_step = active_schedule->assigned_superstep(target); + const unsigned target_step = active_schedule->assigned_superstep(target); if (target_step < start_step || target_step > end_step) continue; - if(thread_data.lock_manager.is_locked(target)) + if (thread_data.lock_manager.is_locked(target)) continue; if (not thread_data.affinity_table.is_selected(target)) { - new_nodes.push_back(target); + new_nodes.push_back(target); continue; } if (max_gain_recompute.find(target) != max_gain_recompute.end()) { 
- max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } + } const unsigned target_proc = active_schedule->assigned_processor(target); - const unsigned target_start_idx = start_idx(target_step, start_step); + const unsigned target_start_idx = start_idx(target_step, start_step); auto &affinity_table = thread_data.affinity_table.at(target); if (move.from_step < target_step + (move.from_proc == target_proc)) { - const unsigned diff = target_step - move.from_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; - unsigned idx = target_start_idx; + const unsigned diff = target_step - move.from_step; + const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0; + unsigned idx = target_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table[p][idx] -= penalty; - } - } + } + } if (idx - 1 < bound && is_compatible(target, move.from_proc)) { - affinity_table[move.from_proc][idx - 1] += penalty; + affinity_table[move.from_proc][idx - 1] += penalty; } } else { const unsigned diff = move.from_step - target_step; - const unsigned window_bound = end_idx(target_step, end_step); - unsigned idx = std::min(window_size + diff, window_bound); - - if (idx < window_bound && is_compatible(target, move.from_proc)) { - affinity_table[move.from_proc][idx] += reward; + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + + if (idx < window_bound && is_compatible(target, move.from_proc)) { + affinity_table[move.from_proc][idx] += reward; } idx++; - + for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : 
proc_range->compatible_processors_vertex(target)) { affinity_table[p][idx] += reward; - } - } + } + } } if (move.to_step < target_step + (move.to_proc == target_proc)) { - unsigned idx = target_start_idx; - const unsigned diff = target_step - move.to_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; + unsigned idx = target_start_idx; + const unsigned diff = target_step - move.to_step; + const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table[p][idx] += penalty; - } - } + } + } if (idx - 1 < bound && is_compatible(target, move.to_proc)) { - affinity_table[move.to_proc][idx - 1] -= penalty; + affinity_table[move.to_proc][idx - 1] -= penalty; } } else { const unsigned diff = move.to_step - target_step; - const unsigned window_bound = end_idx(target_step, end_step); - unsigned idx = std::min(window_size + diff, window_bound); - + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + if (idx < window_bound && is_compatible(target, move.to_proc)) { - affinity_table[move.to_proc][idx] -= reward; + affinity_table[move.to_proc][idx] -= reward; } idx++; - + for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table[p][idx] -= reward; - } - } + } + } } - if (move.to_proc != move.from_proc) { + if (move.to_proc != move.from_proc) { const cost_t comm_gain = graph->vertex_comm_weight(move.node) * comm_multiplier; - + const unsigned window_bound = end_idx(target_step, end_step); - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : 
proc_range->compatible_processors_vertex(target)) { if (p == target_proc) continue; if (node_lambda_map.get_proc_entry(move.node, target_proc) == 1) { @@ -233,144 +233,143 @@ struct kl_hyper_total_comm_cost_function { const cost_t x = instance->communicationCosts(move.from_proc, target_proc) * comm_gain; const cost_t y = instance->communicationCosts(move.to_proc, target_proc) * comm_gain; affinity_table[p][idx] += x - y; - } + } } if (node_lambda_map.has_no_proc_entry(move.node, p)) { for (unsigned idx = target_start_idx; idx < window_bound; idx++) { const cost_t x = instance->communicationCosts(move.from_proc, p) * comm_gain; const cost_t y = instance->communicationCosts(move.to_proc, p) * comm_gain; - affinity_table[p][idx] -= x - y; + affinity_table[p][idx] -= x - y; } - } + } } - } + } } - for (const auto &source : instance->getComputationalDag().parents(move.node)) { + for (const auto &source : instance->getComputationalDag().parents(move.node)) { if (move.to_proc != move.from_proc) { - const unsigned source_proc = active_schedule->assigned_processor(source); - if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) { + const unsigned source_proc = active_schedule->assigned_processor(source); + if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; for (const auto &target : instance->getComputationalDag().children(source)) { const unsigned target_step = active_schedule->assigned_superstep(target); - if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) - continue; + if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) + continue; - if (source_proc != move.from_proc && is_compatible(target, move.from_proc)) { 
+ if (source_proc != move.from_proc && is_compatible(target, move.from_proc)) { if (max_gain_recompute.find(target) != max_gain_recompute.end()) { // todo more specialized update - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } + } - auto & affinity_table_target_from_proc = thread_data.affinity_table.at(target)[move.from_proc]; + auto &affinity_table_target_from_proc = thread_data.affinity_table.at(target)[move.from_proc]; const unsigned target_window_bound = end_idx(target_step, end_step); const cost_t comm_aff = instance->communicationCosts(source_proc, move.from_proc) * comm_gain; for (unsigned idx = start_idx(target_step, start_step); idx < target_window_bound; idx++) { affinity_table_target_from_proc[idx] += comm_aff; } } - } - } else if (node_lambda_map.get_proc_entry(source, move.from_proc) == 1) { + } + } else if (node_lambda_map.get_proc_entry(source, move.from_proc) == 1) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; for (const auto &target : instance->getComputationalDag().children(source)) { const unsigned target_step = active_schedule->assigned_superstep(target); - if ((target_step < start_step || target_step > end_step) || (target == move.node) || thread_data.lock_manager.is_locked(target) || (not thread_data.affinity_table.is_selected(target))) - continue; + if ((target_step < start_step || target_step > end_step) || (target == move.node) || thread_data.lock_manager.is_locked(target) || (not thread_data.affinity_table.is_selected(target))) + continue; const unsigned target_proc = active_schedule->assigned_processor(target); - if (target_proc == move.from_proc) { + if (target_proc == move.from_proc) { if (max_gain_recompute.find(target) != max_gain_recompute.end()) { // todo more specialized update - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; 
} else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } - + } + const unsigned target_start_idx = start_idx(target_step, start_step); const unsigned target_window_bound = end_idx(target_step, end_step); - auto & affinity_table_target = thread_data.affinity_table.at(target); + auto &affinity_table_target = thread_data.affinity_table.at(target); const cost_t comm_aff = instance->communicationCosts(source_proc, target_proc) * comm_gain; for (const unsigned p : proc_range->compatible_processors_vertex(target)) { if (p == target_proc) - continue; - + continue; + for (unsigned idx = target_start_idx; idx < target_window_bound; idx++) { affinity_table_target[p][idx] -= comm_aff; - } + } } break; // since node_lambda_map[source][move.from_proc] == 1 - } - } + } + } } if (node_lambda_map.get_proc_entry(source, move.to_proc) == 1) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - + for (const auto &target : instance->getComputationalDag().children(source)) { const unsigned target_step = active_schedule->assigned_superstep(target); - if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) - continue; - + if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) + continue; + if (source_proc != move.to_proc && is_compatible(target, move.to_proc)) { if (max_gain_recompute.find(target) != max_gain_recompute.end()) { - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } - + } + const unsigned target_window_bound = end_idx(target_step, end_step); - auto & affinity_table_target_to_proc = thread_data.affinity_table.at(target)[move.to_proc]; + auto 
&affinity_table_target_to_proc = thread_data.affinity_table.at(target)[move.to_proc]; const cost_t comm_aff = instance->communicationCosts(source_proc, move.to_proc) * comm_gain; for (unsigned idx = start_idx(target_step, start_step); idx < target_window_bound; idx++) { affinity_table_target_to_proc[idx] -= comm_aff; - } + } } } - } else if (node_lambda_map.get_proc_entry(source, move.to_proc) == 2) { + } else if (node_lambda_map.get_proc_entry(source, move.to_proc) == 2) { for (const auto &target : instance->getComputationalDag().children(source)) { const unsigned target_step = active_schedule->assigned_superstep(target); - if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) - continue; - + if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) + continue; + const unsigned target_proc = active_schedule->assigned_processor(target); if (target_proc == move.to_proc) { if (source_proc != target_proc) { if (max_gain_recompute.find(target) != max_gain_recompute.end()) { - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } - + } + const unsigned target_start_idx = start_idx(target_step, start_step); const unsigned target_window_bound = end_idx(target_step, end_step); - auto & affinity_table_target = thread_data.affinity_table.at(target); + auto &affinity_table_target = thread_data.affinity_table.at(target); const cost_t comm_aff = instance->communicationCosts(source_proc, target_proc) * graph->vertex_comm_weight(source) * comm_multiplier; for (const unsigned p : proc_range->compatible_processors_vertex(target)) { if (p == target_proc) - continue; - + continue; + for (unsigned idx = 
target_start_idx; idx < target_window_bound; idx++) { affinity_table_target[p][idx] += comm_aff; - } + } } } break; - } - } - } + } + } + } } - - const unsigned source_step = active_schedule->assigned_superstep(source); + const unsigned source_step = active_schedule->assigned_superstep(source); if (source_step < start_step || source_step > end_step) continue; - if(thread_data.lock_manager.is_locked(source)) - continue; + if (thread_data.lock_manager.is_locked(source)) + continue; if (not thread_data.affinity_table.is_selected(source)) { new_nodes.push_back(source); @@ -378,111 +377,111 @@ struct kl_hyper_total_comm_cost_function { } if (max_gain_recompute.find(source) != max_gain_recompute.end()) { - max_gain_recompute[source].full_update = true; + max_gain_recompute[source].full_update = true; } else { max_gain_recompute[source] = kl_gain_update_info(source, true); - } + } - const unsigned source_proc = active_schedule->assigned_processor(source); + const unsigned source_proc = active_schedule->assigned_processor(source); const unsigned source_start_idx = start_idx(source_step, start_step); const unsigned window_bound = end_idx(source_step, end_step); - auto & affinity_table_source = thread_data.affinity_table.at(source); + auto &affinity_table_source = thread_data.affinity_table.at(source); if (move.from_step < source_step + (move.from_proc != source_proc)) { - const unsigned diff = source_step - move.from_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = source_step - move.from_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = source_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] += reward; - } + } } if (window_size >= diff && is_compatible(source, move.from_proc)) { - affinity_table_source[move.from_proc][idx] += reward; + affinity_table_source[move.from_proc][idx] += reward; } - } else { + } else { const unsigned diff = move.from_step - source_step; - unsigned idx = window_size + diff; - + unsigned idx = window_size + diff; + if (idx < window_bound && is_compatible(source, move.from_proc)) { - affinity_table_source[move.from_proc][idx] += penalty; + affinity_table_source[move.from_proc][idx] += penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] -= penalty; - } - } + } + } } if (move.to_step < source_step + (move.to_proc != source_proc)) { - const unsigned diff = source_step - move.to_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = source_step - move.to_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = source_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] -= reward; - } + } } if (window_size >= diff && is_compatible(source, move.to_proc)) { - affinity_table_source[move.to_proc][idx] -= reward; + affinity_table_source[move.to_proc][idx] -= reward; } - } else { + } else { const unsigned diff = move.to_step - source_step; - unsigned idx = window_size + diff; + unsigned idx = window_size + diff; if (idx < window_bound && is_compatible(source, move.to_proc)) { - affinity_table_source[move.to_proc][idx] -= penalty; + affinity_table_source[move.to_proc][idx] -= penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] += penalty; - } - } - } - - if (move.to_proc != move.from_proc) { - if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) { + } + } + } + + if (move.to_proc != move.from_proc) { + if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { if (p == source_proc) continue; const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, move.from_proc), instance->communicationCosts(source_proc, move.from_proc), comm_gain); for (unsigned idx = source_start_idx; idx < window_bound; idx++) { affinity_table_source[p][idx] -= comm_cost; - } - } - } + } + } + } if (node_lambda_map.get_proc_entry(source, move.to_proc) == 1) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - for (const unsigned p : 
proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { if (p == source_proc) continue; const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, move.to_proc), instance->communicationCosts(source_proc, move.to_proc), comm_gain); for (unsigned idx = source_start_idx; idx < window_bound; idx++) { affinity_table_source[p][idx] += comm_cost; - } + } } - } - } - } + } + } + } } inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return node_step < window_size + start_step ? window_size - (node_step - start_step) : 0; } - inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return node_step + window_size <= end_step ? window_range : window_range - (node_step + window_size - end_step); } - inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;} + inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return node_step + window_size <= end_step ? window_range : window_range - (node_step + window_size - end_step); } + inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? 
(p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0; } template - void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) { + void compute_comm_affinity(VertexType node, affinity_table_t &affinity_table_node, const cost_t &penalty, const cost_t &reward, const unsigned start_step, const unsigned end_step) { const unsigned node_step = active_schedule->assigned_superstep(node); const unsigned node_proc = active_schedule->assigned_processor(node); const unsigned window_bound = end_idx(node_step, end_step); @@ -490,42 +489,42 @@ struct kl_hyper_total_comm_cost_function { for (const auto &target : instance->getComputationalDag().children(node)) { const unsigned target_step = active_schedule->assigned_superstep(target); - const unsigned target_proc = active_schedule->assigned_processor(target); + const unsigned target_proc = active_schedule->assigned_processor(target); if (target_step < node_step + (target_proc != node_proc)) { - const unsigned diff = node_step - target_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = node_step - target_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = node_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] -= reward; - } + } } if (window_size >= diff && is_compatible(node, target_proc)) { - affinity_table_node[target_proc][idx] -= reward; - } + affinity_table_node[target_proc][idx] -= reward; + } - } else { + } else { const unsigned diff = target_step - node_step; unsigned idx = window_size + diff; if (idx < window_bound && is_compatible(node, target_proc)) { - affinity_table_node[target_proc][idx] -= penalty; + affinity_table_node[target_proc][idx] -= penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] += penalty; - } - } - } + } + } + } } // traget const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier; - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { if (p == node_proc) continue; @@ -540,21 +539,21 @@ struct kl_hyper_total_comm_cost_function { for (const auto &source : instance->getComputationalDag().parents(node)) { const unsigned source_step = active_schedule->assigned_superstep(source); - const unsigned source_proc = active_schedule->assigned_processor(source); + const unsigned source_proc = active_schedule->assigned_processor(source); if (source_step < node_step + (source_proc == node_proc)) { - const unsigned diff = node_step - source_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; + const unsigned diff = node_step - source_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; unsigned idx = node_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { - affinity_table_node[p][idx] += penalty; - } + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] += penalty; + } } if (idx - 1 < bound && is_compatible(node, source_proc)) { - affinity_table_node[source_proc][idx - 1] -= penalty; + affinity_table_node[source_proc][idx - 1] -= penalty; } } else { @@ -562,34 +561,34 @@ struct kl_hyper_total_comm_cost_function { unsigned idx = std::min(window_size + diff, window_bound); if (idx < window_bound && is_compatible(node, source_proc)) { - affinity_table_node[source_proc][idx] -= reward; - } + affinity_table_node[source_proc][idx] -= reward; + } idx++; for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] -= reward; - } - } + } + } } const cost_t source_comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { if (p == node_proc) continue; if (source_proc != node_proc && node_lambda_map.get_proc_entry(source, node_proc) == 1) { for (unsigned idx = node_start_idx; idx < window_bound; idx++) { affinity_table_node[p][idx] -= instance->communicationCosts(source_proc, node_proc) * source_comm_gain; - } + } } if (source_proc != p && node_lambda_map.has_no_proc_entry(source, p)) { for (unsigned idx = node_start_idx; idx < window_bound; idx++) { affinity_table_node[p][idx] += instance->communicationCosts(source_proc, p) * source_comm_gain; } - } + } } } // source } diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp 
b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp index be7c627c..5f471077 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp @@ -22,26 +22,26 @@ limitations under the License. #include "../kl_improver.hpp" namespace osp { -template +template struct kl_total_comm_cost_function { - + using VertexType = vertex_idx_t; using kl_move = kl_move_struct; using kl_gain_update_info = kl_update_info; - + constexpr static bool is_max_comm_cost_function = false; constexpr static unsigned window_range = 2 * window_size + 1; constexpr static bool use_node_communication_costs = use_node_communication_costs_arg || not has_edge_weights_v; - + kl_active_schedule *active_schedule; - compatible_processor_range *proc_range; + CompatibleProcessorRange *proc_range; const Graph_t *graph; const BspInstance *instance; - cost_t comm_multiplier = 1; + cost_t comm_multiplier = 1; cost_t max_comm_weight = 0; inline cost_t get_comm_multiplier() { return comm_multiplier; } @@ -52,23 +52,23 @@ struct kl_total_comm_cost_function { inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); } - void initialize(kl_active_schedule &sched, compatible_processor_range &p_range) { + void initialize(kl_active_schedule &sched, CompatibleProcessorRange &p_range) { active_schedule = &sched; proc_range = &p_range; instance = &sched.getInstance(); graph = &instance->getComputationalDag(); - comm_multiplier = 1.0 / instance->numberOfProcessors(); + comm_multiplier = 1.0 / instance->numberOfProcessors(); } struct empty_struct {}; using pre_move_comm_data_t = empty_struct; - inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); } + inline empty_struct get_pre_move_comm_data(const kl_move &) { return empty_struct(); } 
cost_t compute_schedule_cost_test() { return compute_schedule_cost(); } - void update_datastructure_after_move(const kl_move&, const unsigned, const unsigned) {} + void update_datastructure_after_move(const kl_move &, const unsigned, const unsigned) {} cost_t compute_schedule_cost() { @@ -89,7 +89,7 @@ struct kl_total_comm_cost_function { if (source_proc != target_proc) { if constexpr (use_node_communication_costs) { - const cost_t source_comm_cost = graph->vertex_comm_weight(source_v); + const cost_t source_comm_cost = graph->vertex_comm_weight(source_v); max_comm_weight = std::max(max_comm_weight, source_comm_cost); comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc); } else { @@ -98,108 +98,108 @@ struct kl_total_comm_cost_function { comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc); } } - } + } return work_costs + comm_costs * comm_multiplier + static_cast>(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); } template - void update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map & max_gain_recompute, std::vector &new_nodes) { - - const unsigned & start_step = thread_data.start_step; - const unsigned & end_step = thread_data.end_step; + void update_node_comm_affinity(const kl_move &move, thread_data_t &thread_data, const cost_t &penalty, const cost_t &reward, std::map &max_gain_recompute, std::vector &new_nodes) { + + const unsigned &start_step = thread_data.start_step; + const unsigned &end_step = thread_data.end_step; for (const auto &target : instance->getComputationalDag().children(move.node)) { - const unsigned target_step = active_schedule->assigned_superstep(target); + const unsigned target_step = active_schedule->assigned_superstep(target); if (target_step < start_step || target_step > end_step) continue; - if(thread_data.lock_manager.is_locked(target)) + if 
(thread_data.lock_manager.is_locked(target)) continue; if (not thread_data.affinity_table.is_selected(target)) { - new_nodes.push_back(target); + new_nodes.push_back(target); continue; } if (max_gain_recompute.find(target) != max_gain_recompute.end()) { - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } + } const unsigned target_proc = active_schedule->assigned_processor(target); - const unsigned target_start_idx = start_idx(target_step, start_step); - auto & affinity_table_target = thread_data.affinity_table.at(target); + const unsigned target_start_idx = start_idx(target_step, start_step); + auto &affinity_table_target = thread_data.affinity_table.at(target); if (move.from_step < target_step + (move.from_proc == target_proc)) { - const unsigned diff = target_step - move.from_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; - unsigned idx = target_start_idx; + const unsigned diff = target_step - move.from_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; + unsigned idx = target_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table_target[p][idx] -= penalty; - } - } + } + } if (idx - 1 < bound && is_compatible(target, move.from_proc)) { - affinity_table_target[move.from_proc][idx - 1] += penalty; + affinity_table_target[move.from_proc][idx - 1] += penalty; } } else { const unsigned diff = move.from_step - target_step; - const unsigned window_bound = end_idx(target_step, end_step); - unsigned idx = std::min(window_size + diff, window_bound); - - if (idx < window_bound && is_compatible(target, move.from_proc)) { - affinity_table_target[move.from_proc][idx] += reward; + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + + if (idx < window_bound && is_compatible(target, move.from_proc)) { + affinity_table_target[move.from_proc][idx] += reward; } idx++; - + for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table_target[p][idx] += reward; - } - } + } + } } if (move.to_step < target_step + (move.to_proc == target_proc)) { - unsigned idx = target_start_idx; - const unsigned diff = target_step - move.to_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; + unsigned idx = target_start_idx; + const unsigned diff = target_step - move.to_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table_target[p][idx] += penalty; - } - } + } + } if (idx - 1 < bound && is_compatible(target, move.to_proc)) { - affinity_table_target[move.to_proc][idx - 1] -= penalty; + affinity_table_target[move.to_proc][idx - 1] -= penalty; } } else { const unsigned diff = move.to_step - target_step; - const unsigned window_bound = end_idx(target_step, end_step); - unsigned idx = std::min(window_size + diff, window_bound); - + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + if (idx < window_bound && is_compatible(target, move.to_proc)) { - affinity_table_target[move.to_proc][idx] -= reward; + affinity_table_target[move.to_proc][idx] -= reward; } idx++; - + for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table_target[p][idx] -= reward; - } - } + } + } } - - if (move.to_proc != move.from_proc) { + + if (move.to_proc != move.from_proc) { const auto from_proc_target_comm_cost = instance->communicationCosts(move.from_proc, target_proc); const auto to_proc_target_comm_cost = instance->communicationCosts(move.to_proc, target_proc); @@ -209,21 +209,21 @@ struct kl_total_comm_cost_function { const unsigned window_bound = end_idx(target_step, end_step); for (; idx < window_bound; idx++) { for (const unsigned p : proc_range->compatible_processors_vertex(target)) { - const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain); + const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain); const auto y = 
change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_target_comm_cost, comm_gain); - affinity_table_target[p][idx] += x - y; + affinity_table_target[p][idx] += x - y; } } - } + } } for (const auto &source : instance->getComputationalDag().parents(move.node)) { - const unsigned source_step = active_schedule->assigned_superstep(source); + const unsigned source_step = active_schedule->assigned_superstep(source); if (source_step < start_step || source_step > end_step) continue; - if(thread_data.lock_manager.is_locked(source)) + if (thread_data.lock_manager.is_locked(source)) continue; if (not thread_data.affinity_table.is_selected(source)) { @@ -232,75 +232,75 @@ struct kl_total_comm_cost_function { } if (max_gain_recompute.find(source) != max_gain_recompute.end()) { - max_gain_recompute[source].full_update = true; + max_gain_recompute[source].full_update = true; } else { max_gain_recompute[source] = kl_gain_update_info(source, true); - } + } const unsigned source_proc = active_schedule->assigned_processor(source); const unsigned window_bound = end_idx(source_step, end_step); - auto & affinity_table_source = thread_data.affinity_table.at(source); + auto &affinity_table_source = thread_data.affinity_table.at(source); if (move.from_step < source_step + (move.from_proc != source_proc)) { - const unsigned diff = source_step - move.from_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = source_step - move.from_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = start_idx(source_step, start_step); for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] += reward; - } + } } if (window_size >= diff && is_compatible(source, move.from_proc)) { - affinity_table_source[move.from_proc][idx] += reward; + affinity_table_source[move.from_proc][idx] += reward; } - } else { + } else { const unsigned diff = move.from_step - source_step; - unsigned idx = window_size + diff; - + unsigned idx = window_size + diff; + if (idx < window_bound && is_compatible(source, move.from_proc)) { - affinity_table_source[move.from_proc][idx] += penalty; + affinity_table_source[move.from_proc][idx] += penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] -= penalty; - } - } + } + } } if (move.to_step < source_step + (move.to_proc != source_proc)) { - const unsigned diff = source_step - move.to_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = source_step - move.to_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = start_idx(source_step, start_step); for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] -= reward; - } + } } if (window_size >= diff && is_compatible(source, move.to_proc)) { - affinity_table_source[move.to_proc][idx] -= reward; + affinity_table_source[move.to_proc][idx] -= reward; } - } else { + } else { const unsigned diff = move.to_step - source_step; - unsigned idx = window_size + diff; + unsigned idx = window_size + diff; if (idx < window_bound && is_compatible(source, move.to_proc)) { - affinity_table_source[move.to_proc][idx] -= penalty; + affinity_table_source[move.to_proc][idx] -= penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] += penalty; - } - } - } + } + } + } - if (move.to_proc != move.from_proc) { + if (move.to_proc != move.from_proc) { const auto from_proc_source_comm_cost = instance->communicationCosts(source_proc, move.from_proc); const auto to_proc_source_comm_cost = instance->communicationCosts(source_proc, move.to_proc); @@ -308,23 +308,23 @@ struct kl_total_comm_cost_function { unsigned idx = start_idx(source_step, start_step); for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { - const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain); + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain); const cost_t y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_source_comm_cost, 
comm_gain); - affinity_table_source[p][idx] += x - y; + affinity_table_source[p][idx] += x - y; } } } - } + } } inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return (node_step < window_size + start_step) ? window_size - (node_step - start_step) : 0; } inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return (node_step + window_size <= end_step) ? window_range : window_range - (node_step + window_size - end_step); } - inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;} + inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0; } template - void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) { + void compute_comm_affinity(VertexType node, affinity_table_t &affinity_table_node, const cost_t &penalty, const cost_t &reward, const unsigned start_step, const unsigned end_step) { const unsigned node_step = active_schedule->assigned_superstep(node); const unsigned node_proc = active_schedule->assigned_processor(node); const unsigned window_bound = end_idx(node_step, end_step); @@ -332,37 +332,37 @@ struct kl_total_comm_cost_function { for (const auto &target : instance->getComputationalDag().children(node)) { const unsigned target_step = active_schedule->assigned_superstep(target); - const unsigned target_proc = active_schedule->assigned_processor(target); + const unsigned 
target_proc = active_schedule->assigned_processor(target); if (target_step < node_step + (target_proc != node_proc)) { - const unsigned diff = node_step - target_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = node_step - target_step; + const unsigned bound = window_size > diff ? window_size - diff : 0; unsigned idx = node_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] -= reward; - } + } } if (window_size >= diff && is_compatible(node, target_proc)) { - affinity_table_node[target_proc][idx] -= reward; - } + affinity_table_node[target_proc][idx] -= reward; + } - } else { + } else { const unsigned diff = target_step - node_step; unsigned idx = window_size + diff; if (idx < window_bound && is_compatible(node, target_proc)) { - affinity_table_node[target_proc][idx] -= penalty; + affinity_table_node[target_proc][idx] -= penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] += penalty; - } - } - } + } + } + } const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier; const auto node_target_comm_cost = instance->communicationCosts(node_proc, target_proc); @@ -378,21 +378,21 @@ struct kl_total_comm_cost_function { for (const auto &source : instance->getComputationalDag().parents(node)) { const unsigned source_step = active_schedule->assigned_superstep(source); - const unsigned source_proc = active_schedule->assigned_processor(source); + const unsigned source_proc = active_schedule->assigned_processor(source); if (source_step < node_step + (source_proc == node_proc)) { - const unsigned diff = node_step - source_step; - const unsigned bound = window_size >= diff ? 
window_size - diff + 1: 0; + const unsigned diff = node_step - source_step; + const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0; unsigned idx = node_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { - affinity_table_node[p][idx] += penalty; - } + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] += penalty; + } } if (idx - 1 < bound && is_compatible(node, source_proc)) { - affinity_table_node[source_proc][idx - 1] -= penalty; + affinity_table_node[source_proc][idx - 1] -= penalty; } } else { @@ -400,22 +400,22 @@ struct kl_total_comm_cost_function { unsigned idx = std::min(window_size + diff, window_bound); if (idx < window_bound && is_compatible(node, source_proc)) { - affinity_table_node[source_proc][idx] -= reward; + affinity_table_node[source_proc][idx] -= reward; } - + idx++; for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] -= reward; - } - } + } + } } const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; const auto source_node_comm_cost = instance->communicationCosts(source_proc, node_proc); - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, source_proc), source_node_comm_cost, comm_gain); for (unsigned idx = node_start_idx; idx < window_bound; idx++) { affinity_table_node[p][idx] += comm_cost; @@ -426,4 +426,3 @@ struct kl_total_comm_cost_function { }; } // namespace osp - diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp 
b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp deleted file mode 100644 index f13abda9..00000000 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp +++ /dev/null @@ -1,431 +0,0 @@ -// /* -// Copyright 2024 Huawei Technologies Co., Ltd. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner -// */ - -// #pragma once - -// #include "../kl_active_schedule.hpp" -// #include "../kl_improver.hpp" - -// namespace osp { -// template -// struct kl_total_cut_cost_function { - -// using VertexType = vertex_idx_t; -// using kl_move = kl_move_struct; -// using kl_gain_update_info = kl_update_info; - -// constexpr static unsigned window_range = 2 * window_size + 1; -// constexpr static bool use_node_communication_costs = use_node_communication_costs_arg || not has_edge_weights_v; - -// kl_active_schedule *active_schedule; - -// compatible_processor_range *proc_range; - -// const Graph_t *graph; -// const BspInstance *instance; - -// cost_t comm_multiplier = 1; -// cost_t max_comm_weight = 0; - -// inline cost_t get_comm_multiplier() { return comm_multiplier; } -// inline cost_t get_max_comm_weight() { return max_comm_weight; } -// inline cost_t get_max_comm_weight_multiplied() { return max_comm_weight * comm_multiplier; } - -// const std::string name() const { return "toal_comm_cost"; } - -// inline bool 
is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); } - -// void initialize(kl_active_schedule &sched, compatible_processor_range &p_range) { -// active_schedule = &sched; -// proc_range = &p_range; -// instance = &sched.getInstance(); -// graph = &instance->getComputationalDag(); -// comm_multiplier = 1.0 / instance->numberOfProcessors(); -// } - -// cost_t compute_schedule_cost_test() { -// return compute_schedule_cost(); -// } - -// void update_datastructure_after_move(const kl_move&, const unsigned, const unsigned) {} - -// cost_t compute_schedule_cost() { - -// cost_t work_costs = 0; -// for (unsigned step = 0; step < active_schedule->num_steps(); step++) { -// work_costs += active_schedule->get_step_max_work(step); -// } - -// cost_t comm_costs = 0; -// for (const auto &edge : edges(*graph)) { - -// const auto &source_v = source(edge, *graph); -// const auto &target_v = target(edge, *graph); - -// const unsigned &source_proc = active_schedule->assigned_processor(source_v); -// const unsigned &target_proc = active_schedule->assigned_processor(target_v); - -// if ((source_proc != target_proc) || (active_schedule->assigned_superstep(source_v) != active_schedule->assigned_superstep(target_v))) { - -// if constexpr (use_node_communication_costs) { -// const cost_t source_comm_cost = graph->vertex_comm_weight(source_v); -// max_comm_weight = std::max(max_comm_weight, source_comm_cost); -// comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc); -// } else { -// const cost_t source_comm_cost = graph->edge_comm_weight(edge); -// max_comm_weight = std::max(max_comm_weight, source_comm_cost); -// comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc); -// } -// } -// } - -// return work_costs + comm_costs * comm_multiplier + static_cast>(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); -// } - -// template -// void 
update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map & max_gain_recompute, std::vector &new_nodes) { - -// const unsigned & start_step = thread_data.start_step; -// const unsigned & end_step = thread_data.end_step; - -// for (const auto &target : instance->getComputationalDag().children(move.node)) { - -// const unsigned target_step = active_schedule->assigned_superstep(target); -// if (target_step < start_step || target_step > end_step) -// continue; - -// if(thread_data.lock_manager.is_locked(target)) -// continue; - -// if (not thread_data.affinity_table.is_selected(target)) { -// new_nodes.push_back(target); -// continue; -// } - -// if (max_gain_recompute.find(target) != max_gain_recompute.end()) { -// max_gain_recompute[target].full_update = true; -// } else { -// max_gain_recompute[target] = kl_gain_update_info(target, true); -// } - -// const unsigned target_proc = active_schedule->assigned_processor(target); -// const unsigned target_start_idx = start_idx(target_step, start_step); -// auto & affinity_table_target = thread_data.affinity_table.at(target); - -// if (move.from_step < target_step + (move.from_proc == target_proc)) { - -// const unsigned diff = target_step - move.from_step; -// const unsigned bound = window_size >= diff ? 
window_size - diff + 1: 0; -// unsigned idx = target_start_idx; -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// affinity_table_target[p][idx] -= penalty; -// } -// } - -// if (idx - 1 < bound && is_compatible(target, move.from_proc)) { -// affinity_table_target[move.from_proc][idx - 1] += penalty; -// } - -// } else { - -// const unsigned diff = move.from_step - target_step; -// const unsigned window_bound = end_idx(target_step, end_step); -// unsigned idx = std::min(window_size + diff, window_bound); - -// if (idx < window_bound && is_compatible(target, move.from_proc)) { -// affinity_table_target[move.from_proc][idx] += reward; -// } - -// idx++; - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// affinity_table_target[p][idx] += reward; -// } -// } -// } - -// if (move.to_step < target_step + (move.to_proc == target_proc)) { -// unsigned idx = target_start_idx; -// const unsigned diff = target_step - move.to_step; -// const unsigned bound = window_size >= diff ? 
window_size - diff + 1: 0; -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// affinity_table_target[p][idx] += penalty; -// } -// } - -// if (idx - 1 < bound && is_compatible(target, move.to_proc)) { -// affinity_table_target[move.to_proc][idx - 1] -= penalty; -// } - -// } else { -// const unsigned diff = move.to_step - target_step; -// const unsigned window_bound = end_idx(target_step, end_step); -// unsigned idx = std::min(window_size + diff, window_bound); - -// if (idx < window_bound && is_compatible(target, move.to_proc)) { -// affinity_table_target[move.to_proc][idx] -= reward; -// } - -// idx++; - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// affinity_table_target[p][idx] -= reward; -// } -// } -// } - -// if (move.to_proc != move.from_proc) { -// const auto from_proc_target_comm_cost = instance->communicationCosts(move.from_proc, target_proc); -// const auto to_proc_target_comm_cost = instance->communicationCosts(move.to_proc, target_proc); - -// const cost_t comm_gain = graph->vertex_comm_weight(move.node) * comm_multiplier; - -// unsigned idx = target_start_idx; -// const unsigned window_bound = end_idx(target_step, end_step); -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain); -// const auto y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_target_comm_cost, comm_gain); -// affinity_table_target[p][idx] += x - y; -// } -// } -// } -// } - -// for (const auto &source : instance->getComputationalDag().parents(move.node)) { - -// const unsigned source_step = active_schedule->assigned_superstep(source); -// if (source_step < start_step || source_step > end_step) -// continue; - -// 
if(thread_data.lock_manager.is_locked(source)) -// continue; - -// if (not thread_data.affinity_table.is_selected(source)) { -// new_nodes.push_back(source); -// continue; -// } - -// if (max_gain_recompute.find(source) != max_gain_recompute.end()) { -// max_gain_recompute[source].full_update = true; -// } else { -// max_gain_recompute[source] = kl_gain_update_info(source, true); -// } - -// const unsigned source_proc = active_schedule->assigned_processor(source); -// const unsigned window_bound = end_idx(source_step, end_step); -// auto & affinity_table_source = thread_data.affinity_table.at(source); - -// if (move.from_step < source_step + (move.from_proc != source_proc)) { - -// const unsigned diff = source_step - move.from_step; -// const unsigned bound = window_size > diff ? window_size - diff : 0; -// unsigned idx = start_idx(source_step, start_step); -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// affinity_table_source[p][idx] += reward; -// } -// } - -// if (window_size >= diff && is_compatible(source, move.from_proc)) { -// affinity_table_source[move.from_proc][idx] += reward; -// } - -// } else { - -// const unsigned diff = move.from_step - source_step; -// unsigned idx = window_size + diff; - -// if (idx < window_bound && is_compatible(source, move.from_proc)) { -// affinity_table_source[move.from_proc][idx] += penalty; -// } - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// affinity_table_source[p][idx] -= penalty; -// } -// } -// } - -// if (move.to_step < source_step + (move.to_proc != source_proc)) { -// const unsigned diff = source_step - move.to_step; -// const unsigned bound = window_size > diff ? 
window_size - diff : 0; -// unsigned idx = start_idx(source_step, start_step); -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// affinity_table_source[p][idx] -= reward; -// } -// } - -// if (window_size >= diff && is_compatible(source, move.to_proc)) { -// affinity_table_source[move.to_proc][idx] -= reward; -// } - -// } else { -// const unsigned diff = move.to_step - source_step; -// unsigned idx = window_size + diff; - -// if (idx < window_bound && is_compatible(source, move.to_proc)) { -// affinity_table_source[move.to_proc][idx] -= penalty; -// } -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// affinity_table_source[p][idx] += penalty; -// } -// } -// } - -// if (move.to_proc != move.from_proc) { -// const auto from_proc_source_comm_cost = instance->communicationCosts(source_proc, move.from_proc); -// const auto to_proc_source_comm_cost = instance->communicationCosts(source_proc, move.to_proc); - -// const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - -// unsigned idx = start_idx(source_step, start_step); -// const unsigned window_bound = end_idx(source_step, end_step); -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain); -// const cost_t y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_source_comm_cost, comm_gain); -// affinity_table_source[p][idx] += x - y; -// } -// } -// } -// } -// } - -// inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return (node_step < window_size + start_step) ? 
window_size - (node_step - start_step) : 0; } -// inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return (node_step + window_size <= end_step) ? window_range : window_range - (node_step + window_size - end_step); } - -// inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;} - -// template -// void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) { -// const unsigned node_step = active_schedule->assigned_superstep(node); -// const unsigned node_proc = active_schedule->assigned_processor(node); -// const unsigned window_bound = end_idx(node_step, end_step); -// const unsigned node_start_idx = start_idx(node_step, start_step); - -// for (const auto &target : instance->getComputationalDag().children(node)) { -// const unsigned target_step = active_schedule->assigned_superstep(target); -// const unsigned target_proc = active_schedule->assigned_processor(target); - -// if (target_step < node_step + (target_proc != node_proc)) { -// const unsigned diff = node_step - target_step; -// const unsigned bound = window_size > diff ? 
window_size - diff : 0; -// unsigned idx = node_start_idx; - -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// affinity_table_node[p][idx] -= reward; -// } -// } - -// if (window_size >= diff && is_compatible(node, target_proc)) { -// affinity_table_node[target_proc][idx] -= reward; -// } - -// } else { -// const unsigned diff = target_step - node_step; -// unsigned idx = window_size + diff; - -// if (idx < window_bound && is_compatible(node, target_proc)) { -// affinity_table_node[target_proc][idx] -= penalty; -// } - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// affinity_table_node[p][idx] += penalty; -// } -// } -// } - -// const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier; -// const auto node_target_comm_cost = instance->communicationCosts(node_proc, target_proc); - -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// if (p != target_proc) { -// const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, target_proc), node_target_comm_cost, comm_gain); -// for (unsigned idx = node_start_idx; idx < window_bound; idx++) { -// affinity_table_node[p][idx] += comm_cost; -// } -// } else { -// for (unsigned idx = node_start_idx; idx < window_bound; idx++) { -// if(idx == 0) continue; -// affinity_table_node[p][idx] += comm_gain; -// } -// } -// } - -// } // traget - -// for (const auto &source : instance->getComputationalDag().parents(node)) { -// const unsigned source_step = active_schedule->assigned_superstep(source); -// const unsigned source_proc = active_schedule->assigned_processor(source); - -// if (source_step < node_step + (source_proc == node_proc)) { -// const unsigned diff = node_step - source_step; -// const unsigned bound = window_size >= diff ? 
window_size - diff + 1: 0; -// unsigned idx = node_start_idx; - -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// affinity_table_node[p][idx] += penalty; -// } -// } - -// if (idx - 1 < bound && is_compatible(node, source_proc)) { -// affinity_table_node[source_proc][idx - 1] -= penalty; -// } - -// } else { -// const unsigned diff = source_step - node_step; -// unsigned idx = std::min(window_size + diff, window_bound); - -// if (idx < window_bound && is_compatible(node, source_proc)) { -// affinity_table_node[source_proc][idx] -= reward; -// } - -// idx++; - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// affinity_table_node[p][idx] -= reward; -// } -// } -// } - -// const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; -// const auto source_node_comm_cost = instance->communicationCosts(source_proc, node_proc); - -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, source_proc), source_node_comm_cost, comm_gain); -// for (unsigned idx = node_start_idx; idx < window_bound; idx++) { -// affinity_table_node[p][idx] += comm_cost; -// } -// } -// } // source -// } -// }; - -// } // namespace osp - diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_active_schedule.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_active_schedule.hpp index 6fe460f8..862eeacc 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_active_schedule.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_active_schedule.hpp @@ -16,13 +16,12 @@ limitations under the License. @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner */ - #pragma once #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/model/IBspSchedule.hpp" -#include "osp/bsp/model/SetSchedule.hpp" -#include "osp/bsp/model/VectorSchedule.hpp" +#include "osp/bsp/model/util/SetSchedule.hpp" +#include "osp/bsp/model/util/VectorSchedule.hpp" #include "osp/bsp/scheduler/ImprovementScheduler.hpp" #include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" #include "osp/graph_algorithms/directed_graph_util.hpp" @@ -54,7 +53,7 @@ struct kl_move_struct { bool operator>(kl_move_struct const &rhs) const { return (gain > rhs.gain) or (gain >= rhs.gain and node < rhs.node); } - + kl_move_struct reverse_move() const { return kl_move_struct(node, -gain, to_proc, to_step, from_proc, from_step); } @@ -73,13 +72,12 @@ struct pre_move_work_data { pre_move_work_data() {} pre_move_work_data(work_weight_t from_step_max_work_, work_weight_t from_step_second_max_work_, unsigned from_step_max_work_processor_count_, - work_weight_t to_step_max_work_, work_weight_t to_step_second_max_work_, - unsigned to_step_max_work_processor_count_) + work_weight_t to_step_max_work_, work_weight_t to_step_second_max_work_, + unsigned to_step_max_work_processor_count_) : from_step_max_work(from_step_max_work_), from_step_second_max_work(from_step_second_max_work_), from_step_max_work_processor_count(from_step_max_work_processor_count_), to_step_max_work(to_step_max_work_), to_step_second_max_work(to_step_second_max_work_), - to_step_max_work_processor_count(to_step_max_work_processor_count_) {} - + to_step_max_work_processor_count(to_step_max_work_processor_count_) {} }; template @@ -87,16 +85,16 @@ struct kl_active_schedule_work_datastructures { using work_weight_t = v_workw_t; - const BspInstance *instance; + const BspInstance *instance; const SetSchedule *set_schedule; - + struct weight_proc { work_weight_t work; unsigned proc; weight_proc() : work(0), proc(0) {} weight_proc(work_weight_t _work, unsigned _proc) : work(_work), 
proc(_proc) {} - + bool operator<(weight_proc const &rhs) const { return (work > rhs.work) or (work == rhs.work and proc < rhs.proc); } @@ -106,17 +104,17 @@ struct kl_active_schedule_work_datastructures { std::vector> step_processor_position; std::vector step_max_work_processor_count; work_weight_t max_work_weight; - work_weight_t total_work_weight; + work_weight_t total_work_weight; inline work_weight_t step_max_work(unsigned step) const { return step_processor_work_[step][0].work; } inline work_weight_t step_second_max_work(unsigned step) const { return step_processor_work_[step][step_max_work_processor_count[step]].work; } inline work_weight_t step_proc_work(unsigned step, unsigned proc) const { return step_processor_work_[step][step_processor_position[step][proc]].work; } - inline work_weight_t & step_proc_work(unsigned step, unsigned proc) { return step_processor_work_[step][step_processor_position[step][proc]].work; } + inline work_weight_t &step_proc_work(unsigned step, unsigned proc) { return step_processor_work_[step][step_processor_position[step][proc]].work; } template - inline pre_move_work_data get_pre_move_work_data(kl_move_struct move) { + inline pre_move_work_data get_pre_move_work_data(kl_move_struct move) { return pre_move_work_data(step_max_work(move.from_step), step_second_max_work(move.from_step), step_max_work_processor_count[move.from_step], - step_max_work(move.to_step), step_second_max_work(move.to_step), step_max_work_processor_count[move.to_step]); + step_max_work(move.to_step), step_second_max_work(move.to_step), step_max_work_processor_count[move.to_step]); } inline void initialize(const SetSchedule &sched, const BspInstance &inst, unsigned num_steps) { @@ -140,20 +138,20 @@ struct kl_active_schedule_work_datastructures { unsigned pos = 0; const work_weight_t max_work_to = step_processor_work_[step][0].work; - for (const auto & wp : step_processor_work_[step]) { + for (const auto &wp : step_processor_work_[step]) { 
step_processor_position[step][wp.proc] = pos++; if (wp.work == max_work_to && pos < instance->numberOfProcessors()) - step_max_work_processor_count[step] = pos; + step_max_work_processor_count[step] = pos; } } template - void apply_move(kl_move_struct move, work_weight_t work_weight) { + void apply_move(kl_move_struct move, work_weight_t work_weight) { - if (work_weight == 0) + if (work_weight == 0) return; - + if (move.to_step != move.from_step) { step_proc_work(move.to_step, move.to_proc) += work_weight; step_proc_work(move.from_step, move.from_proc) -= work_weight; @@ -171,7 +169,7 @@ struct kl_active_schedule_work_datastructures { // } // unsigned to_proc_pos = step_processor_position[move.to_step][move.to_proc]; - + // while (to_proc_pos > 0 && step_processor_work_[move.to_step][to_proc_pos - 1].work < new_weight_to) { // std::swap(step_processor_work_[move.to_step][to_proc_pos], step_processor_work_[move.to_step][to_proc_pos - 1]); // std::swap(step_processor_position[move.to_step][step_processor_work_[move.to_step][to_proc_pos].proc], step_processor_position[move.to_step][step_processor_work_[move.to_step][to_proc_pos - 1].proc]); @@ -189,15 +187,15 @@ struct kl_active_schedule_work_datastructures { // std::swap(step_processor_position[move.from_step][step_processor_work_[move.from_step][from_proc_pos].proc], step_processor_position[move.from_step][step_processor_work_[move.from_step][from_proc_pos + 1].proc]); // from_proc_pos++; // } - + // if (prev_max_work_from == prev_weight_from) { - // step_max_work_processor_count[move.from_step]--; - // if (step_max_work_processor_count[move.from_step] == 0) { - // step_max_work_processor_count[move.from_step] = from_proc_pos; + // step_max_work_processor_count[move.from_step]--; + // if (step_max_work_processor_count[move.from_step] == 0) { + // step_max_work_processor_count[move.from_step] = from_proc_pos; // } - // } + // } - } else { + } else { step_proc_work(move.to_step, move.to_proc) += work_weight; 
step_proc_work(move.from_step, move.from_proc) -= work_weight; arrange_superstep_data(move.to_step); @@ -209,21 +207,21 @@ struct kl_active_schedule_work_datastructures { std::swap(step_processor_position[step1], step_processor_position[step2]); std::swap(step_max_work_processor_count[step1], step_max_work_processor_count[step2]); } - + void override_next_superstep(unsigned step) { const unsigned next_step = step + 1; for (unsigned i = 0; i < instance->numberOfProcessors(); i++) { - step_processor_work_[next_step][i] = step_processor_work_[step][i]; - step_processor_position[next_step][i] = step_processor_position[step][i]; + step_processor_work_[next_step][i] = step_processor_work_[step][i]; + step_processor_position[next_step][i] = step_processor_position[step][i]; } step_max_work_processor_count[next_step] = step_max_work_processor_count[step]; } void reset_superstep(unsigned step) { for (unsigned i = 0; i < instance->numberOfProcessors(); i++) { - step_processor_work_[step][i] = {0,i}; - step_processor_position[step][i] = i; + step_processor_work_[step][i] = {0, i}; + step_processor_position[step][i] = i; } step_max_work_processor_count[step] = instance->numberOfProcessors() - 1; } @@ -249,12 +247,12 @@ struct kl_active_schedule_work_datastructures { step_max_work_processor_count[step] = 1; } else if (step_processor_work_[step][proc].work == max_work && step_max_work_processor_count[step] < (instance->numberOfProcessors() - 1)) { step_max_work_processor_count[step]++; - } + } } std::sort(step_processor_work_[step].begin(), step_processor_work_[step].end()); unsigned pos = 0; - for (const auto & wp : step_processor_work_[step]) { + for (const auto &wp : step_processor_work_[step]) { step_processor_position[step][wp.proc] = pos++; } } @@ -287,15 +285,15 @@ struct thread_local_active_schedule_data { cost = cost_; best_cost = cost_; feasible = true; - } - + } + inline void update_cost(cost_t change_in_cost) { - cost += change_in_cost; + cost += change_in_cost; if 
(cost <= best_cost && feasible) { best_cost = cost; best_schedule_idx = static_cast(applied_moves.size()); - } + } } }; @@ -319,23 +317,23 @@ class kl_active_schedule { public: virtual ~kl_active_schedule() = default; - inline const BspInstance & getInstance() const { return *instance; } - inline const VectorSchedule & getVectorSchedule() const { return vector_schedule; } - inline VectorSchedule & getVectorSchedule() { return vector_schedule; } - inline const SetSchedule & getSetSchedule() const { return set_schedule; } + inline const BspInstance &getInstance() const { return *instance; } + inline const VectorSchedule &getVectorSchedule() const { return vector_schedule; } + inline VectorSchedule &getVectorSchedule() { return vector_schedule; } + inline const SetSchedule &getSetSchedule() const { return set_schedule; } inline cost_t get_cost() { return cost; } inline bool is_feasible() { return feasible; } inline unsigned num_steps() const { return vector_schedule.numberOfSupersteps(); } inline unsigned assigned_processor(VertexType node) const { return vector_schedule.assignedProcessor(node); } inline unsigned assigned_superstep(VertexType node) const { return vector_schedule.assignedSuperstep(node); } - inline v_workw_t get_step_max_work(unsigned step) const {return work_datastructures.step_max_work(step); } - inline v_workw_t get_step_second_max_work(unsigned step) const {return work_datastructures.step_second_max_work(step); } - inline std::vector & get_step_max_work_processor_count() {return work_datastructures.step_max_work_processor_count; } - inline v_workw_t get_step_processor_work(unsigned step, unsigned proc) const {return work_datastructures.step_proc_work(step, proc); } + inline v_workw_t get_step_max_work(unsigned step) const { return work_datastructures.step_max_work(step); } + inline v_workw_t get_step_second_max_work(unsigned step) const { return work_datastructures.step_second_max_work(step); } + inline std::vector 
&get_step_max_work_processor_count() { return work_datastructures.step_max_work_processor_count; } + inline v_workw_t get_step_processor_work(unsigned step, unsigned proc) const { return work_datastructures.step_proc_work(step, proc); } inline pre_move_work_data> get_pre_move_work_data(kl_move move) { return work_datastructures.get_pre_move_work_data(move); } inline v_workw_t get_max_work_weight() { return work_datastructures.max_work_weight; } inline v_workw_t get_total_work_weight() { return work_datastructures.total_work_weight; } - inline void set_cost(cost_t cost_) { cost = cost_; } + inline void set_cost(cost_t cost_) { cost = cost_; } constexpr static bool use_memory_constraint = is_local_search_memory_constraint_v; @@ -343,11 +341,11 @@ class kl_active_schedule { kl_active_schedule_work_datastructures work_datastructures; - inline v_workw_t get_step_total_work(unsigned step) const { - v_workw_t total_work = 0; + inline v_workw_t get_step_total_work(unsigned step) const { + v_workw_t total_work = 0; for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { total_work += get_step_processor_work(step, proc); - } + } return total_work; } @@ -357,18 +355,18 @@ class kl_active_schedule { set_schedule.step_processor_vertices[move.from_step][move.from_proc].erase(move.node); set_schedule.step_processor_vertices[move.to_step][move.to_proc].insert(move.node); - + update_violations(move.node, thread_data); thread_data.applied_moves.push_back(move); work_datastructures.apply_move(move, instance->getComputationalDag().vertex_work_weight(move.node)); if constexpr (use_memory_constraint) { memory_constraint.apply_move(move.node, move.from_proc, move.from_step, move.to_proc, move.to_step); - } + } } template - void revert_to_best_schedule(unsigned start_move, unsigned insert_step, comm_datastructures_t & comm_datastructures, thread_data_t & thread_data, unsigned start_step, unsigned & end_step) { + void revert_to_best_schedule(unsigned start_move, unsigned 
insert_step, comm_datastructures_t &comm_datastructures, thread_data_t &thread_data, unsigned start_step, unsigned &end_step) { const unsigned bound = std::max(start_move, thread_data.best_schedule_idx); revert_moves(bound, comm_datastructures, thread_data, start_step, end_step); @@ -391,7 +389,7 @@ class kl_active_schedule { } template - void revert_schedule_to_bound(const size_t bound, const cost_t new_cost, const bool is_feasible, comm_datastructures_t & comm_datastructures, thread_data_t & thread_data, unsigned start_step, unsigned end_step) { + void revert_schedule_to_bound(const size_t bound, const cost_t new_cost, const bool is_feasible, comm_datastructures_t &comm_datastructures, thread_data_t &thread_data, unsigned start_step, unsigned end_step) { revert_moves(bound, comm_datastructures, thread_data, start_step, end_step); thread_data.current_violations.clear(); @@ -399,10 +397,9 @@ class kl_active_schedule { thread_data.cost = new_cost; } - - void compute_violations(thread_data_t & thread_data); + void compute_violations(thread_data_t &thread_data); void compute_work_memory_datastructures(unsigned start_step, unsigned end_step); - void write_schedule (BspSchedule &schedule); + void write_schedule(BspSchedule &schedule); inline void initialize(const IBspSchedule &schedule); inline void clear(); void remove_empty_step(unsigned step); @@ -412,15 +409,14 @@ class kl_active_schedule { void swap_steps(const unsigned step1, const unsigned step2); private: - template - void revert_moves(const size_t bound, comm_datastructures_t & comm_datastructures, thread_data_t & thread_data, unsigned start_step, unsigned end_step) { + void revert_moves(const size_t bound, comm_datastructures_t &comm_datastructures, thread_data_t &thread_data, unsigned start_step, unsigned end_step) { while (thread_data.applied_moves.size() > bound) { const auto move = thread_data.applied_moves.back().reverse_move(); thread_data.applied_moves.pop_back(); 
vector_schedule.setAssignedProcessor(move.node, move.to_proc); - vector_schedule.setAssignedSuperstep(move.node, move.to_step); + vector_schedule.setAssignedSuperstep(move.node, move.to_step); set_schedule.step_processor_vertices[move.from_step][move.from_proc].erase(move.node); set_schedule.step_processor_vertices[move.to_step][move.to_proc].insert(move.node); @@ -443,16 +439,16 @@ class kl_active_schedule { const auto &child = target(edge, instance->getComputationalDag()); if (thread_data.current_violations.find(edge) == thread_data.current_violations.end()) { - if ((node_step > vector_schedule.assignedSuperstep(child)) || + if ((node_step > vector_schedule.assignedSuperstep(child)) || (node_step == vector_schedule.assignedSuperstep(child) && node_proc != vector_schedule.assignedProcessor(child))) { - thread_data.current_violations.insert(edge); - thread_data.new_violations[child] = edge; + thread_data.current_violations.insert(edge); + thread_data.new_violations[child] = edge; } } else { - if ((node_step < vector_schedule.assignedSuperstep(child)) || + if ((node_step < vector_schedule.assignedSuperstep(child)) || (node_step == vector_schedule.assignedSuperstep(child) && node_proc == vector_schedule.assignedProcessor(child))) { - thread_data.current_violations.erase(edge); - thread_data.resolved_violations.insert(edge); + thread_data.current_violations.erase(edge); + thread_data.resolved_violations.insert(edge); } } } @@ -460,17 +456,17 @@ class kl_active_schedule { for (const auto &edge : in_edges(node, instance->getComputationalDag())) { const auto &parent = source(edge, instance->getComputationalDag()); - if (thread_data.current_violations.find(edge) == thread_data.current_violations.end()) { - if ((node_step < vector_schedule.assignedSuperstep(parent)) || + if (thread_data.current_violations.find(edge) == thread_data.current_violations.end()) { + if ((node_step < vector_schedule.assignedSuperstep(parent)) || (node_step == 
vector_schedule.assignedSuperstep(parent) && node_proc != vector_schedule.assignedProcessor(parent))) { - thread_data.current_violations.insert(edge); - thread_data.new_violations[parent] = edge; + thread_data.current_violations.insert(edge); + thread_data.new_violations[parent] = edge; } } else { - if ((node_step > vector_schedule.assignedSuperstep(parent)) || + if ((node_step > vector_schedule.assignedSuperstep(parent)) || (node_step == vector_schedule.assignedSuperstep(parent) && node_proc == vector_schedule.assignedProcessor(parent))) { - thread_data.current_violations.erase(edge); - thread_data.resolved_violations.insert(edge); + thread_data.current_violations.erase(edge); + thread_data.resolved_violations.insert(edge); } } } @@ -501,7 +497,6 @@ class kl_active_schedule { thread_data.feasible = true; } } - }; template @@ -515,7 +510,7 @@ void kl_active_schedule::clear() { } template -void kl_active_schedule::compute_violations(thread_data_t & thread_data) { +void kl_active_schedule::compute_violations(thread_data_t &thread_data) { thread_data.current_violations.clear(); thread_data.feasible = true; @@ -529,12 +524,12 @@ void kl_active_schedule::compute_violations const unsigned target_proc = assigned_processor(target_v); const unsigned source_step = assigned_superstep(source_v); const unsigned target_step = assigned_superstep(target_v); - + if (source_step > target_step || (source_step == target_step && source_proc != target_proc)) { thread_data.current_violations.insert(edge); thread_data.feasible = false; - } - } + } + } } template @@ -563,7 +558,7 @@ void kl_active_schedule::compute_work_memor } template -void kl_active_schedule::write_schedule (BspSchedule &schedule) { +void kl_active_schedule::write_schedule(BspSchedule &schedule) { for (const auto v : instance->vertices()) { schedule.setAssignedProcessor(v, vector_schedule.assignedProcessor(v)); schedule.setAssignedSuperstep(v, vector_schedule.assignedSuperstep(v)); @@ -572,91 +567,92 @@ void 
kl_active_schedule::write_schedule (Bs } template -void kl_active_schedule::remove_empty_step(unsigned step) { +void kl_active_schedule::remove_empty_step(unsigned step) { for (unsigned i = step; i < num_steps() - 1; i++) { - for(unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - for (const auto node : set_schedule.step_processor_vertices[i + 1][proc]){ + for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + for (const auto node : set_schedule.step_processor_vertices[i + 1][proc]) { vector_schedule.setAssignedSuperstep(node, i); } } std::swap(set_schedule.step_processor_vertices[i], set_schedule.step_processor_vertices[i + 1]); - work_datastructures.swap_steps(i, i+1); + work_datastructures.swap_steps(i, i + 1); if constexpr (use_memory_constraint) { - memory_constraint.swap_steps(i, i+1); + memory_constraint.swap_steps(i, i + 1); } } vector_schedule.number_of_supersteps--; } template -void kl_active_schedule::swap_empty_step_fwd(const unsigned step, const unsigned to_step) { +void kl_active_schedule::swap_empty_step_fwd(const unsigned step, const unsigned to_step) { for (unsigned i = step; i < to_step; i++) { - for(unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - for (const auto node : set_schedule.step_processor_vertices[i + 1][proc]){ + for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + for (const auto node : set_schedule.step_processor_vertices[i + 1][proc]) { vector_schedule.setAssignedSuperstep(node, i); } } std::swap(set_schedule.step_processor_vertices[i], set_schedule.step_processor_vertices[i + 1]); work_datastructures.swap_steps(i, i + 1); if constexpr (use_memory_constraint) { - memory_constraint.swap_steps(i, i+1); + memory_constraint.swap_steps(i, i + 1); } } } template void kl_active_schedule::insert_empty_step(unsigned step) { - unsigned i = vector_schedule.number_of_supersteps++; - + unsigned i = vector_schedule.number_of_supersteps++; + for (; i > step; i--) { - 
for(unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - for (const auto node : set_schedule.step_processor_vertices[i-1][proc]){ + for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + for (const auto node : set_schedule.step_processor_vertices[i - 1][proc]) { vector_schedule.setAssignedSuperstep(node, i); } } std::swap(set_schedule.step_processor_vertices[i], set_schedule.step_processor_vertices[i - 1]); - work_datastructures.swap_steps(i-1, i); + work_datastructures.swap_steps(i - 1, i); if constexpr (use_memory_constraint) { - memory_constraint.swap_steps(i - 1, i); + memory_constraint.swap_steps(i - 1, i); } - } + } } template void kl_active_schedule::swap_empty_step_bwd(const unsigned to_step, const unsigned empty_step) { - unsigned i = to_step; - + unsigned i = to_step; + for (; i > empty_step; i--) { - for(unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - for (const auto node : set_schedule.step_processor_vertices[i-1][proc]){ + for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + for (const auto node : set_schedule.step_processor_vertices[i - 1][proc]) { vector_schedule.setAssignedSuperstep(node, i); } } std::swap(set_schedule.step_processor_vertices[i], set_schedule.step_processor_vertices[i - 1]); - work_datastructures.swap_steps(i-1, i); + work_datastructures.swap_steps(i - 1, i); if constexpr (use_memory_constraint) { - memory_constraint.swap_steps(i - 1, i); + memory_constraint.swap_steps(i - 1, i); } - } + } } template void kl_active_schedule::swap_steps(const unsigned step1, const unsigned step2) { - if (step1 == step2) return; + if (step1 == step2) + return; - for(unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { - for (const auto node : set_schedule.step_processor_vertices[step1][proc]){ + for (unsigned proc = 0; proc < instance->numberOfProcessors(); proc++) { + for (const auto node : set_schedule.step_processor_vertices[step1][proc]) { 
vector_schedule.setAssignedSuperstep(node, step2); } - for (const auto node : set_schedule.step_processor_vertices[step2][proc]){ + for (const auto node : set_schedule.step_processor_vertices[step2][proc]) { vector_schedule.setAssignedSuperstep(node, step1); } } std::swap(set_schedule.step_processor_vertices[step1], set_schedule.step_processor_vertices[step2]); - work_datastructures.swap_steps(step1, step2); + work_datastructures.swap_steps(step1, step2); if constexpr (use_memory_constraint) { memory_constraint.swap_steps(step1, step2); - } + } } } // namespace osp diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp index 97bd35a7..dd572710 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp @@ -30,6 +30,7 @@ limitations under the License. #include "osp/auxiliary/datastructures/heaps/PairingHeap.hpp" #include "osp/auxiliary/misc.hpp" +#include "osp/bsp/model/util/CompatibleProcessorRange.hpp" #include "osp/bsp/scheduler/ImprovementScheduler.hpp" #include "osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp" #include "osp/graph_algorithms/directed_graph_edge_desc_util.hpp" @@ -152,7 +153,7 @@ class kl_improver : public ImprovementScheduler { const Graph_t *graph; const BspInstance *instance; - compatible_processor_range proc_range; + CompatibleProcessorRange proc_range; kl_parameter parameters; std::mt19937 gen; diff --git a/include/osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp b/include/osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp index 6961ef92..2cee3d0f 100644 --- a/include/osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/LocalSearchMemoryConstraintModules.hpp @@ -19,8 +19,8 @@ limitations under the License. 
#pragma once #include "osp/bsp/model/BspSchedule.hpp" -#include "osp/bsp/model/SetSchedule.hpp" -#include "osp/bsp/model/VectorSchedule.hpp" +#include "osp/bsp/model/util/SetSchedule.hpp" +#include "osp/bsp/model/util/VectorSchedule.hpp" #include "osp/graph_algorithms/directed_graph_util.hpp" namespace osp { @@ -42,7 +42,7 @@ struct is_local_search_memory_constraint< std::declval(), std::declval(), std::declval(), std::declval())), decltype(std::declval().compute_memory_datastructure(std::declval(), - std::declval())), + std::declval())), decltype(std::declval().swap_steps(std::declval(), std::declval())), decltype(std::declval().reset_superstep(std::declval())), decltype(std::declval().override_superstep(std::declval(), std::declval(), @@ -105,7 +105,7 @@ struct ls_local_memory_constraint { void swap_steps(const unsigned step1, const unsigned step2) { std::swap(step_processor_memory[step1], step_processor_memory[step2]); - } + } void compute_memory_datastructure(unsigned start_step, unsigned end_step) { @@ -150,7 +150,7 @@ struct ls_local_memory_constraint { } } return true; - } + } }; template @@ -378,7 +378,7 @@ struct ls_local_sources_inc_edges_memory_constraint { inline void swap_steps(const unsigned step1, const unsigned step2) { std::swap(step_processor_memory[step1], step_processor_memory[step2]); std::swap(step_processor_pred[step1], step_processor_pred[step2]); - } + } inline void initialize(const SetSchedule &set_schedule_, const VectorSchedule &vec_schedule_) { @@ -587,7 +587,6 @@ struct ls_local_sources_inc_edges_memory_constraint { } return true; - } }; diff --git a/include/osp/bsp/scheduler/Scheduler.hpp b/include/osp/bsp/scheduler/Scheduler.hpp index a57e2e84..fa458ba9 100644 --- a/include/osp/bsp/scheduler/Scheduler.hpp +++ b/include/osp/bsp/scheduler/Scheduler.hpp @@ -18,6 +18,7 @@ limitations under the License. 
#pragma once +#include "osp/auxiliary/return_status.hpp" #include "osp/bsp/model/BspInstance.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/model/BspScheduleCS.hpp" diff --git a/include/osp/coarser/BspScheduleCoarser.hpp b/include/osp/coarser/BspScheduleCoarser.hpp index 64684b7a..ea4cf9f9 100644 --- a/include/osp/coarser/BspScheduleCoarser.hpp +++ b/include/osp/coarser/BspScheduleCoarser.hpp @@ -18,10 +18,10 @@ limitations under the License. #pragma once -#include "osp/coarser/Coarser.hpp" #include "osp/bsp/model/BspSchedule.hpp" -#include "osp/bsp/model/SetSchedule.hpp" +#include "osp/bsp/model/util/SetSchedule.hpp" #include "osp/bsp/scheduler/Scheduler.hpp" +#include "osp/coarser/Coarser.hpp" #include "osp/graph_algorithms/directed_graph_edge_desc_util.hpp" namespace osp { @@ -63,7 +63,6 @@ class BspScheduleCoarser : public CoarserGenContractionMapgetInstance().getComputationalDag()); assert(schedule->satisfiesPrecedenceConstraints()); - SetSchedule set_schedule(*schedule); std::vector reverse_vertex_map(dag_in.num_vertices(), 0); std::vector> vertex_map; diff --git a/include/osp/coarser/MultilevelCoarser.hpp b/include/osp/coarser/MultilevelCoarser.hpp index bbd090e4..f8a1434e 100644 --- a/include/osp/coarser/MultilevelCoarser.hpp +++ b/include/osp/coarser/MultilevelCoarser.hpp @@ -23,11 +23,11 @@ limitations under the License. 
#include #include -#include "osp/coarser/Coarser.hpp" +#include "osp/auxiliary/return_status.hpp" #include "osp/bsp/model/BspInstance.hpp" +#include "osp/coarser/Coarser.hpp" #include "osp/coarser/coarser_util.hpp" - namespace osp { template @@ -36,10 +36,12 @@ class MultilevelCoarseAndSchedule; template class MultilevelCoarser : public Coarser { friend class MultilevelCoarseAndSchedule; + private: const Graph_t *original_graph; + protected: - inline const Graph_t * getOriginalGraph() const { return original_graph; }; + inline const Graph_t *getOriginalGraph() const { return original_graph; }; std::vector> dag_history; std::vector>>> contraction_maps; @@ -49,7 +51,7 @@ class MultilevelCoarser : public Coarser { RETURN_STATUS add_contraction(const std::vector> &contraction_map, const Graph_t_coarse &contracted_graph); RETURN_STATUS add_contraction(std::vector> &&contraction_map, Graph_t_coarse &&contracted_graph); void add_identity_contraction(); - + std::vector> getCombinedContractionMap() const; virtual RETURN_STATUS run_contractions() = 0; @@ -62,19 +64,15 @@ class MultilevelCoarser : public Coarser { MultilevelCoarser(const Graph_t &graph) : original_graph(&graph) {}; virtual ~MultilevelCoarser() = default; - bool coarsenDag(const Graph_t &dag_in, Graph_t_coarse &coarsened_dag, - std::vector> &vertex_contraction_map) override; + std::vector> &vertex_contraction_map) override; - RETURN_STATUS run(const Graph_t &graph); RETURN_STATUS run(const BspInstance &inst); virtual std::string getCoarserName() const override = 0; }; - - template RETURN_STATUS MultilevelCoarser::run(const Graph_t &graph) { clear_computation_data(); @@ -91,7 +89,7 @@ RETURN_STATUS MultilevelCoarser::run(const Graph_t &gra } template -RETURN_STATUS MultilevelCoarser::run(const BspInstance< Graph_t > &inst) { +RETURN_STATUS MultilevelCoarser::run(const BspInstance &inst) { return run(inst.getComputationalDag()); } @@ -99,15 +97,15 @@ template void MultilevelCoarser::clear_computation_data() { 
dag_history.clear(); dag_history.shrink_to_fit(); - + contraction_maps.clear(); contraction_maps.shrink_to_fit(); } - template void MultilevelCoarser::compactify_dag_history() { - if (dag_history.size() < 3) return; + if (dag_history.size() < 3) + return; size_t dag_indx_first = dag_history.size() - 2; size_t map_indx_first = contraction_maps.size() - 2; @@ -115,13 +113,13 @@ void MultilevelCoarser::compactify_dag_history() { size_t dag_indx_second = dag_history.size() - 1; size_t map_indx_second = contraction_maps.size() - 1; - if ( (static_cast( dag_history[dag_indx_first-1]->num_vertices() ) / static_cast( dag_history[dag_indx_second-1]->num_vertices() )) > 1.25 ) return; - + if ((static_cast(dag_history[dag_indx_first - 1]->num_vertices()) / static_cast(dag_history[dag_indx_second - 1]->num_vertices())) > 1.25) + return; // Compute combined contraction_map - std::unique_ptr>> combi_contraction_map = std::make_unique>>( contraction_maps[map_indx_first]->size() ); + std::unique_ptr>> combi_contraction_map = std::make_unique>>(contraction_maps[map_indx_first]->size()); for (std::size_t vert = 0; vert < contraction_maps[map_indx_first]->size(); ++vert) { - combi_contraction_map->at(vert) = contraction_maps[map_indx_second]->at( contraction_maps[map_indx_first]->at( vert ) ); + combi_contraction_map->at(vert) = contraction_maps[map_indx_second]->at(contraction_maps[map_indx_first]->at(vert)); } // Delete ComputationalDag @@ -138,7 +136,6 @@ void MultilevelCoarser::compactify_dag_history() { contraction_maps[map_indx_first] = std::move(combi_contraction_map); } - template RETURN_STATUS MultilevelCoarser::add_contraction(const std::vector> &contraction_map) { std::unique_ptr new_graph = std::make_unique(); @@ -148,12 +145,12 @@ RETURN_STATUS MultilevelCoarser::add_contraction(const bool success = false; if (dag_history.size() == 0) { - success = coarser_util::construct_coarse_dag(*(getOriginalGraph()), *new_graph, *(contraction_maps.back()) ); + success = 
coarser_util::construct_coarse_dag(*(getOriginalGraph()), *new_graph, *(contraction_maps.back())); } else { - success = coarser_util::construct_coarse_dag(*(dag_history.back()), *new_graph, *(contraction_maps.back()) ); + success = coarser_util::construct_coarse_dag(*(dag_history.back()), *new_graph, *(contraction_maps.back())); } - dag_history.emplace_back( std::move(new_graph) ); + dag_history.emplace_back(std::move(new_graph)); if (success) { compactify_dag_history(); @@ -166,19 +163,19 @@ RETURN_STATUS MultilevelCoarser::add_contraction(const template RETURN_STATUS MultilevelCoarser::add_contraction(std::vector> &&contraction_map) { std::unique_ptr new_graph = std::make_unique(); - + std::unique_ptr>> contr_map_ptr(new std::vector>(std::move(contraction_map))); contraction_maps.emplace_back(std::move(contr_map_ptr)); bool success = false; if (dag_history.size() == 0) { - success = coarser_util::construct_coarse_dag(*(getOriginalGraph()), *new_graph, *(contraction_maps.back()) ); + success = coarser_util::construct_coarse_dag(*(getOriginalGraph()), *new_graph, *(contraction_maps.back())); } else { - success = coarser_util::construct_coarse_dag(*(dag_history.back()), *new_graph, *(contraction_maps.back()) ); + success = coarser_util::construct_coarse_dag(*(dag_history.back()), *new_graph, *(contraction_maps.back())); } - dag_history.emplace_back( std::move(new_graph) ); + dag_history.emplace_back(std::move(new_graph)); if (success) { compactify_dag_history(); @@ -188,12 +185,11 @@ RETURN_STATUS MultilevelCoarser::add_contraction(std::v } } - template RETURN_STATUS MultilevelCoarser::add_contraction(const std::vector> &contraction_map, const Graph_t_coarse &contracted_graph) { std::unique_ptr graph_ptr(new Graph_t_coarse(contracted_graph)); dag_history.emplace_back(std::move(graph_ptr)); - + std::unique_ptr>> contr_map_ptr(new std::vector>(contraction_map)); contraction_maps.emplace_back(std::move(contr_map_ptr)); @@ -213,7 +209,6 @@ RETURN_STATUS 
MultilevelCoarser::add_contraction(std::v return RETURN_STATUS::OSP_SUCCESS; } - template std::vector> MultilevelCoarser::getCombinedContractionMap() const { std::vector> combinedContractionMap(original_graph->num_vertices()); @@ -221,23 +216,22 @@ std::vector> MultilevelCoarserat( combinedContractionMap[i] ); + combinedContractionMap[i] = contraction_maps[j]->at(combinedContractionMap[i]); } } return combinedContractionMap; } - - template bool MultilevelCoarser::coarsenDag(const Graph_t &dag_in, Graph_t_coarse &coarsened_dag, - std::vector> &vertex_contraction_map) { + std::vector> &vertex_contraction_map) { clear_computation_data(); RETURN_STATUS status = run(dag_in); - if (status != RETURN_STATUS::OSP_SUCCESS && status != RETURN_STATUS::BEST_FOUND) return false; + if (status != RETURN_STATUS::OSP_SUCCESS && status != RETURN_STATUS::BEST_FOUND) + return false; assert(dag_history.size() != 0); coarsened_dag = *(dag_history.back()); @@ -251,20 +245,16 @@ template void MultilevelCoarser::add_identity_contraction() { std::size_t n_vert; if (dag_history.size() == 0) { - n_vert = static_cast( original_graph->num_vertices() ); + n_vert = static_cast(original_graph->num_vertices()); } else { - n_vert = static_cast( dag_history.back()->num_vertices() ); + n_vert = static_cast(dag_history.back()->num_vertices()); } - - std::vector> contraction_map( n_vert ); + + std::vector> contraction_map(n_vert); std::iota(contraction_map.begin(), contraction_map.end(), 0); add_contraction(std::move(contraction_map)); compactify_dag_history(); } - - - - } // end namespace osp \ No newline at end of file diff --git a/include/osp/dag_divider/AbstractWavefrontScheduler.hpp b/include/osp/dag_divider/AbstractWavefrontScheduler.hpp index 556e82bc..69a3c80c 100644 --- a/include/osp/dag_divider/AbstractWavefrontScheduler.hpp +++ b/include/osp/dag_divider/AbstractWavefrontScheduler.hpp @@ -21,10 +21,10 @@ limitations under the License. 
#include "osp/graph_algorithms/computational_dag_util.hpp" #include "osp/graph_algorithms/subgraph_algorithms.hpp" #include "osp/graph_implementations/boost_graphs/boost_graph.hpp" -#include #include -#include #include +#include +#include namespace osp { @@ -34,7 +34,7 @@ namespace osp { */ template class AbstractWavefrontScheduler : public Scheduler { -protected: + protected: IDagDivider *divider; Scheduler *scheduler; static constexpr bool enable_debug_prints = true; @@ -46,17 +46,17 @@ class AbstractWavefrontScheduler : public Scheduler { */ bool distributeProcessors( unsigned total_processors_of_type, - const std::vector& work_weights, - std::vector& allocation) const { - + const std::vector &work_weights, + std::vector &allocation) const { + allocation.assign(work_weights.size(), 0); double total_work = std::accumulate(work_weights.begin(), work_weights.end(), 0.0); if (total_work <= 1e-9 || total_processors_of_type == 0) { return false; } - + std::vector active_indices; - for(size_t i = 0; i < work_weights.size(); ++i) { + for (size_t i = 0; i < work_weights.size(); ++i) { if (work_weights[i] > 1e-9) { active_indices.push_back(i); } @@ -68,7 +68,7 @@ class AbstractWavefrontScheduler : public Scheduler { size_t num_active_components = active_indices.size(); unsigned remaining_procs = total_processors_of_type; - + // --- Stage 1: Guarantee at least one processor if possible (anti-starvation) --- if (total_processors_of_type >= num_active_components) { // Abundance case: Give one processor to each active component first. @@ -79,11 +79,11 @@ class AbstractWavefrontScheduler : public Scheduler { } else { // Scarcity case: Not enough processors for each active component. 
std::vector> sorted_work; - for(size_t idx : active_indices) { + for (size_t idx : active_indices) { sorted_work.push_back({work_weights[idx], idx}); } std::sort(sorted_work.rbegin(), sorted_work.rend()); - for(unsigned i = 0; i < remaining_procs; ++i) { + for (unsigned i = 0; i < remaining_procs; ++i) { allocation[sorted_work[i].second]++; } return true; // Scarcity case was hit. @@ -93,10 +93,10 @@ class AbstractWavefrontScheduler : public Scheduler { if (remaining_procs > 0) { std::vector adjusted_work_weights; double adjusted_total_work = 0; - + double work_per_proc = total_work / static_cast(total_processors_of_type); - for(size_t idx : active_indices) { + for (size_t idx : active_indices) { double adjusted_work = std::max(0.0, work_weights[idx] - work_per_proc); adjusted_work_weights.push_back(adjusted_work); adjusted_total_work += adjusted_work; @@ -123,14 +123,13 @@ class AbstractWavefrontScheduler : public Scheduler { } } } - } + } return false; // Scarcity case was not hit. } - BspArchitecture createSubArchitecture( const BspArchitecture &original_arch, - const std::vector& sub_dag_proc_types) const { + const std::vector &sub_dag_proc_types) const { // The calculation is now inside the assert, so it only happens in debug builds. 
assert(std::accumulate(sub_dag_proc_types.begin(), sub_dag_proc_types.end(), 0u) > 0 && "Attempted to create a sub-architecture with zero processors."); @@ -142,33 +141,34 @@ class AbstractWavefrontScheduler : public Scheduler { sub_dag_processor_memory[original_arch.processorType(i)] = std::min(original_arch.memoryBound(i), sub_dag_processor_memory[original_arch.processorType(i)]); } - sub_architecture.set_processors_consequ_types(sub_dag_proc_types, sub_dag_processor_memory); + sub_architecture.SetProcessorsConsequTypes(sub_dag_proc_types, sub_dag_processor_memory); return sub_architecture; } - bool validateWorkDistribution(const std::vector& sub_dags, const BspInstance& instance) const { - const auto& original_arch = instance.getArchitecture(); - for (const auto& rep_sub_dag : sub_dags) { + bool validateWorkDistribution(const std::vector &sub_dags, const BspInstance &instance) const { + const auto &original_arch = instance.getArchitecture(); + for (const auto &rep_sub_dag : sub_dags) { const double total_rep_work = sumOfVerticesWorkWeights(rep_sub_dag); - + double sum_of_compatible_works_for_rep = 0.0; for (unsigned type_idx = 0; type_idx < original_arch.getNumberOfProcessorTypes(); ++type_idx) { sum_of_compatible_works_for_rep += sumOfCompatibleWorkWeights(rep_sub_dag, instance, type_idx); } if (sum_of_compatible_works_for_rep > total_rep_work + 1e-9) { - if constexpr (enable_debug_prints) std::cerr << "ERROR: Sum of compatible work (" << sum_of_compatible_works_for_rep - << ") exceeds total work (" << total_rep_work - << ") for a sub-dag. Aborting." << std::endl; + if constexpr (enable_debug_prints) + std::cerr << "ERROR: Sum of compatible work (" << sum_of_compatible_works_for_rep + << ") exceeds total work (" << total_rep_work + << ") for a sub-dag. Aborting." 
<< std::endl; return false; } } return true; } -public: + public: AbstractWavefrontScheduler(IDagDivider &div, Scheduler &sched) : divider(&div), scheduler(&sched) {} }; -} +} // namespace osp diff --git a/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp b/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp index 5ba326d9..83556089 100644 --- a/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp +++ b/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp @@ -16,22 +16,22 @@ limitations under the License. @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner */ -#include -#include -#include -#include -#include "OrbitGraphProcessor.hpp" +#include "EftSubgraphScheduler.hpp" #include "HashComputer.hpp" #include "MerkleHashComputer.hpp" -#include "EftSubgraphScheduler.hpp" +#include "OrbitGraphProcessor.hpp" #include "TrimmedGroupScheduler.hpp" #include "osp/auxiliary/io/DotFileWriter.hpp" #include "osp/bsp/scheduler/Scheduler.hpp" #include "osp/graph_algorithms/subgraph_algorithms.hpp" +#include +#include +#include +#include namespace osp { -/** +/** * @brief A scheduler that leverages isomorphic subgraphs to partition a DAG. 
* * @class IsomorphicSubgraphScheduler @@ -58,12 +58,11 @@ class IsomorphicSubgraphScheduler { static_assert(std::is_same_v, vertex_idx_t>, "Graph_t and Constr_Graph_t must have the same vertex_idx types"); - private: - - static constexpr bool verbose = false; - const HashComputer>* hash_computer_; + private: + static constexpr bool verbose = false; + const HashComputer> *hash_computer_; size_t symmetry_ = 4; - Scheduler * bsp_scheduler_; + Scheduler *bsp_scheduler_; bool use_max_group_size_ = false; unsigned max_group_size_ = 0; bool plot_dot_graphs_ = false; @@ -76,22 +75,21 @@ class IsomorphicSubgraphScheduler { bool use_max_bsp = false; bool use_adaptive_symmetry_threshold = true; - public: - - explicit IsomorphicSubgraphScheduler(Scheduler & bsp_scheduler) + public: + explicit IsomorphicSubgraphScheduler(Scheduler &bsp_scheduler) : hash_computer_(nullptr), bsp_scheduler_(&bsp_scheduler), plot_dot_graphs_(false) {} - IsomorphicSubgraphScheduler(Scheduler & bsp_scheduler, const HashComputer>& hash_computer) + IsomorphicSubgraphScheduler(Scheduler &bsp_scheduler, const HashComputer> &hash_computer) : hash_computer_(&hash_computer), bsp_scheduler_(&bsp_scheduler), plot_dot_graphs_(false) {} virtual ~IsomorphicSubgraphScheduler() {} - void setMergeDifferentTypes(bool flag) {merge_different_node_types = flag;} - void setWorkThreshold(v_workw_t work_threshold) {work_threshold_ = work_threshold;} - void setCriticalPathThreshold(v_workw_t critical_path_threshold) {critical_path_threshold_ = critical_path_threshold;} - void setOrbitLockRatio(double orbit_lock_ratio) {orbit_lock_ratio_ = orbit_lock_ratio;} - void setNaturalBreaksCountPercentage(double natural_breaks_count_percentage) {natural_breaks_count_percentage_ = natural_breaks_count_percentage;} - void setAllowTrimmedScheduler(bool flag) {allow_use_trimmed_scheduler = flag;} + void setMergeDifferentTypes(bool flag) { merge_different_node_types = flag; } + void setWorkThreshold(v_workw_t work_threshold) { 
work_threshold_ = work_threshold; } + void setCriticalPathThreshold(v_workw_t critical_path_threshold) { critical_path_threshold_ = critical_path_threshold; } + void setOrbitLockRatio(double orbit_lock_ratio) { orbit_lock_ratio_ = orbit_lock_ratio; } + void setNaturalBreaksCountPercentage(double natural_breaks_count_percentage) { natural_breaks_count_percentage_ = natural_breaks_count_percentage; } + void setAllowTrimmedScheduler(bool flag) { allow_use_trimmed_scheduler = flag; } void set_plot_dot_graphs(bool plot) { plot_dot_graphs_ = plot; } void disable_use_max_group_size() { use_max_group_size_ = false; } void setUseMaxBsp(bool flag) { use_max_bsp = flag; } @@ -100,12 +98,12 @@ class IsomorphicSubgraphScheduler { max_group_size_ = max_group_size; } void setEnableAdaptiveSymmetryThreshold() { use_adaptive_symmetry_threshold = true; } - void setUseStaticSymmetryLevel(size_t static_symmetry_level) { - use_adaptive_symmetry_threshold = false; - symmetry_ = static_symmetry_level; + void setUseStaticSymmetryLevel(size_t static_symmetry_level) { + use_adaptive_symmetry_threshold = false; + symmetry_ = static_symmetry_level; } - std::vector> compute_partition(const BspInstance& instance) { + std::vector> compute_partition(const BspInstance &instance) { OrbitGraphProcessor orbit_processor; orbit_processor.set_work_threshold(work_threshold_); orbit_processor.setMergeDifferentNodeTypes(merge_different_node_types); @@ -116,7 +114,7 @@ class IsomorphicSubgraphScheduler { orbit_processor.setUseStaticSymmetryLevel(symmetry_); } - std::unique_ptr>> local_hasher; + std::unique_ptr>> local_hasher; if (!hash_computer_) { local_hasher = std::make_unique, true>>(instance.getComputationalDag(), instance.getComputationalDag()); hash_computer_ = local_hasher.get(); @@ -125,7 +123,7 @@ class IsomorphicSubgraphScheduler { orbit_processor.discover_isomorphic_groups(instance.getComputationalDag(), *hash_computer_); auto isomorphic_groups = orbit_processor.get_final_groups(); - + 
std::vector was_trimmed(isomorphic_groups.size(), false); trim_subgraph_groups(isomorphic_groups, instance, was_trimmed); // Apply trimming and record which groups were affected @@ -157,8 +155,7 @@ class IsomorphicSubgraphScheduler { return partition; } - protected: - + protected: template struct subgraph_scheduler_input { BspInstance instance; @@ -167,14 +164,14 @@ class IsomorphicSubgraphScheduler { std::vector>> required_proc_types; }; - void trim_subgraph_groups(std::vector::Group>& isomorphic_groups, - const BspInstance& instance, - std::vector& was_trimmed) { + void trim_subgraph_groups(std::vector::Group> &isomorphic_groups, + const BspInstance &instance, + std::vector &was_trimmed) { if constexpr (verbose) { std::cout << "\n--- Trimming Isomorphic Subgraph Groups ---" << std::endl; } for (size_t group_idx = 0; group_idx < isomorphic_groups.size(); ++group_idx) { - auto& group = isomorphic_groups[group_idx]; + auto &group = isomorphic_groups[group_idx]; const unsigned group_size = static_cast(group.size()); if (group_size <= 1) continue; @@ -194,24 +191,24 @@ class IsomorphicSubgraphScheduler { if constexpr (has_typed_vertices_v) { if (!group.subgraphs.empty() && !group.subgraphs[0].empty()) { common_node_type = instance.getComputationalDag().vertex_type(group.subgraphs[0][0]); - const auto& rep_subgraph = group.subgraphs[0]; - for (const auto& vertex : rep_subgraph) { + const auto &rep_subgraph = group.subgraphs[0]; + for (const auto &vertex : rep_subgraph) { if (instance.getComputationalDag().vertex_type(vertex) != common_node_type) { is_single_type_group = false; break; } } } else { - is_single_type_group = false; + is_single_type_group = false; } } else { - is_single_type_group = false; + is_single_type_group = false; } if (is_single_type_group) { // Dynamically determine min_proc_type_count based on compatible processors for this type unsigned min_compatible_processors = std::numeric_limits::max(); - const auto& proc_type_counts = 
instance.getArchitecture().getProcessorTypeCount(); + const auto &proc_type_counts = instance.getArchitecture().getProcessorTypeCount(); bool found_compatible_processor = false; for (unsigned proc_type_idx = 0; proc_type_idx < proc_type_counts.size(); ++proc_type_idx) { @@ -222,13 +219,13 @@ } if (found_compatible_processor) { if constexpr (verbose) { - std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type + std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type << "). Min compatible processors: " << min_compatible_processors << "." << std::endl; } effective_min_proc_type_count = min_compatible_processors; } else { if constexpr (verbose) { - std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type + std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type << ") but no compatible processors found. Disabling trimming." << std::endl; } // If no compatible processors found for this type, effectively disable trimming for this group. @@ -236,7 +233,11 @@ } } else { // Fallback to a default min_proc_type_count if not a single-type group or no typed vertices. - effective_min_proc_type_count = instance.getArchitecture().getMinProcessorTypeCount(); + const auto &type_count = instance.getArchitecture().getProcessorTypeCount(); + // Guard the empty case: std::min_element on an empty range returns end(), which must not be dereferenced. + effective_min_proc_type_count = + type_count.empty() ? 0 + : *std::min_element(type_count.begin(), type_count.end()); if constexpr (verbose) { std::cout << "Group " << group_idx << " (size " << group_size << "): Multi-type or untyped group. Using default min_proc_type_count: " << effective_min_proc_type_count << "."
<< std::endl; } @@ -257,13 +258,13 @@ class IsomorphicSubgraphScheduler { if (gcd < group_size) { if constexpr (verbose) { - std::cout << " -> Trimming group " << group_idx << ". GCD(" << group_size << ", " << effective_min_proc_type_count + std::cout << " -> Trimming group " << group_idx << ". GCD(" << group_size << ", " << effective_min_proc_type_count << ") = " << gcd << ". Merging " << group_size / gcd << " subgraphs at a time." << std::endl; } if (allow_use_trimmed_scheduler) gcd = 1; - + was_trimmed[group_idx] = true; const unsigned merge_size = group_size / gcd; std::vector>> new_subgraphs; @@ -279,7 +280,7 @@ class IsomorphicSubgraphScheduler { } for (unsigned k = 0; k < merge_size; ++k) { - const auto& sg_to_merge_vertices = group.subgraphs[original_sg_cursor]; + const auto &sg_to_merge_vertices = group.subgraphs[original_sg_cursor]; original_sg_cursor++; merged_sg_vertices.insert(merged_sg_vertices.end(), sg_to_merge_vertices.begin(), sg_to_merge_vertices.end()); } @@ -292,16 +293,16 @@ class IsomorphicSubgraphScheduler { } was_trimmed[group_idx] = false; } - } + } } subgraph_scheduler_input prepare_subgraph_scheduling_input( - const BspInstance& original_instance, - const std::vector::Group>& isomorphic_groups, - const std::vector& was_trimmed) { - + const BspInstance &original_instance, + const std::vector::Group> &isomorphic_groups, + const std::vector &was_trimmed) { + subgraph_scheduler_input result; - result.instance.setArchitecture(original_instance.getArchitecture()); + result.instance.getArchitecture() = original_instance.getArchitecture(); const unsigned num_proc_types = original_instance.getArchitecture().getNumberOfProcessorTypes(); result.multiplicities.resize(isomorphic_groups.size()); @@ -332,35 +333,35 @@ class IsomorphicSubgraphScheduler { ++coarse_node_idx; } coarser_util::construct_coarse_dag(original_instance.getComputationalDag(), result.instance.getComputationalDag(), - contraction_map); + contraction_map); if constexpr (verbose) { 
std::cout << "\n--- Preparing Subgraph Scheduling Input ---\n"; std::cout << "Found " << isomorphic_groups.size() << " isomorphic groups to schedule as coarse nodes.\n"; for (size_t j = 0; j < isomorphic_groups.size(); ++j) { std::cout << " - Coarse Node " << j << " (from " << isomorphic_groups[j].subgraphs.size() - << " isomorphic subgraphs):\n"; + << " isomorphic subgraphs):\n"; std::cout << " - Multiplicity for scheduling: " << result.multiplicities[j] << "\n"; std::cout << " - Total Work (in coarse graph): " << result.instance.getComputationalDag().vertex_work_weight(j) << "\n"; std::cout << " - Required Processor Types: "; for (unsigned k = 0; k < num_proc_types; ++k) { std::cout << result.required_proc_types[j][k] << " "; } - std::cout << "\n"; + std::cout << "\n"; std::cout << " - Max number of processors: " << result.max_num_processors[j] << "\n"; } } return result; } - void schedule_isomorphic_group(const BspInstance& instance, - const std::vector::Group>& isomorphic_groups, - const SubgraphSchedule & sub_sched, - std::vector> & partition) { + void schedule_isomorphic_group(const BspInstance &instance, + const std::vector::Group> &isomorphic_groups, + const SubgraphSchedule &sub_sched, + std::vector> &partition) { vertex_idx_t current_partition_idx = 0; for (size_t group_idx = 0; group_idx < isomorphic_groups.size(); ++group_idx) { - const auto& group = isomorphic_groups[group_idx]; + const auto &group = isomorphic_groups[group_idx]; if (group.subgraphs.empty()) { continue; } @@ -372,48 +373,48 @@ class IsomorphicSubgraphScheduler { BspInstance representative_instance; auto rep_global_to_local_map = create_induced_subgraph_map(instance.getComputationalDag(), representative_instance.getComputationalDag(), rep_subgraph_vertices_sorted); - representative_instance.setArchitecture(instance.getArchitecture()); - const auto& procs_for_group = sub_sched.node_assigned_worker_per_type[group_idx]; + representative_instance.getArchitecture() = 
instance.getArchitecture(); + const auto &procs_for_group = sub_sched.node_assigned_worker_per_type[group_idx]; std::vector> mem_weights(procs_for_group.size(), 0); for (unsigned proc_type = 0; proc_type < procs_for_group.size(); ++proc_type) { mem_weights[proc_type] = static_cast>(instance.getArchitecture().maxMemoryBoundProcType(proc_type)); } - representative_instance.getArchitecture().set_processors_consequ_types(procs_for_group, mem_weights); + representative_instance.getArchitecture().SetProcessorsConsequTypes(procs_for_group, mem_weights); representative_instance.setNodeProcessorCompatibility(instance.getProcessorCompatibilityMatrix()); // --- Decide which scheduler to use --- unsigned min_non_zero_procs = std::numeric_limits::max(); - for (const auto& proc_count : procs_for_group) { + for (const auto &proc_count : procs_for_group) { if (proc_count > 0) { min_non_zero_procs = std::min(min_non_zero_procs, proc_count); } } - bool use_trimmed_scheduler = sub_sched.was_trimmed[group_idx] && min_non_zero_procs > 1 && allow_use_trimmed_scheduler; - - Scheduler* scheduler_for_group_ptr; + + Scheduler *scheduler_for_group_ptr; std::unique_ptr> trimmed_scheduler_owner; if (use_trimmed_scheduler) { - if constexpr (verbose) std::cout << "Using TrimmedGroupScheduler for group " << group_idx << std::endl; + if constexpr (verbose) + std::cout << "Using TrimmedGroupScheduler for group " << group_idx << std::endl; trimmed_scheduler_owner = std::make_unique>(*bsp_scheduler_, min_non_zero_procs); scheduler_for_group_ptr = trimmed_scheduler_owner.get(); } else { - if constexpr (verbose) std::cout << "Using standard BSP scheduler for group " << group_idx << std::endl; + if constexpr (verbose) + std::cout << "Using standard BSP scheduler for group " << group_idx << std::endl; scheduler_for_group_ptr = bsp_scheduler_; } - // --- Schedule the representative to get the pattern --- BspSchedule bsp_schedule(representative_instance); if constexpr (verbose) { std::cout << "--- 
Scheduling representative for group " << group_idx << " ---" << std::endl; std::cout << " Number of subgraphs in group: " << group.subgraphs.size() << std::endl; - const auto& rep_dag = representative_instance.getComputationalDag(); + const auto &rep_dag = representative_instance.getComputationalDag(); std::cout << " Representative subgraph size: " << rep_dag.num_vertices() << " vertices" << std::endl; std::vector node_type_counts(rep_dag.num_vertex_types(), 0); - for (const auto& v : rep_dag.vertices()) { + for (const auto &v : rep_dag.vertices()) { node_type_counts[rep_dag.vertex_type(v)]++; } std::cout << " Node type counts: "; @@ -424,45 +425,34 @@ class IsomorphicSubgraphScheduler { } std::cout << std::endl; - const auto& sub_arch = representative_instance.getArchitecture(); + const auto &sub_arch = representative_instance.getArchitecture(); std::cout << " Sub-architecture for scheduling:" << std::endl; std::cout << " Processors: " << sub_arch.numberOfProcessors() << std::endl; std::cout << " Processor types counts: "; - const auto& type_counts = sub_arch.getProcessorTypeCount(); + const auto &type_counts = sub_arch.getProcessorTypeCount(); for (size_t type_idx = 0; type_idx < type_counts.size(); ++type_idx) { std::cout << "T" << type_idx << ":" << type_counts[type_idx] << " "; } std::cout << std::endl; std::cout << " Sync cost: " << sub_arch.synchronisationCosts() << ", Comm cost: " << sub_arch.communicationCosts() << std::endl; - std::cout << " Sub-problem compatibility matrix:" << std::endl; - const auto & sub_comp_matrix = representative_instance.getNodeNodeCompatabilityMatrix(); - for(unsigned i = 0; i < sub_comp_matrix.size(); ++i) { - std::cout << " Node Type " << i << ": [ "; - for (unsigned j = 0; j < sub_comp_matrix[i].size(); ++j) { - std::cout << (sub_comp_matrix[i][j] ? 
"1" : "0") << " "; - } - std::cout << "]" << std::endl; - } - } - + scheduler_for_group_ptr->computeSchedule(bsp_schedule); if constexpr (verbose) { - std::cout << " Schedule satisfies precedence constraints: "; + std::cout << " Schedule satisfies precedence constraints: "; std::cout << bsp_schedule.satisfiesPrecedenceConstraints() << std::endl; std::cout << " Schedule satisfies node type constraints: "; std::cout << bsp_schedule.satisfiesNodeTypeConstraints() << std::endl; } - if (plot_dot_graphs_) { - const auto& rep_dag = bsp_schedule.getInstance().getComputationalDag(); + const auto &rep_dag = bsp_schedule.getInstance().getComputationalDag(); std::vector colors(rep_dag.num_vertices()); std::map, unsigned> proc_ss_to_color; unsigned next_color = 0; - for (const auto& v : rep_dag.vertices()) { + for (const auto &v : rep_dag.vertices()) { const auto assignment = std::make_pair(bsp_schedule.assignedProcessor(v), bsp_schedule.assignedSuperstep(v)); if (proc_ss_to_color.find(assignment) == proc_ss_to_color.end()) { proc_ss_to_color[assignment] = next_color++; @@ -476,12 +466,10 @@ class IsomorphicSubgraphScheduler { ss << std::put_time(std::localtime(&in_time_t), "%Y%m%d_%H%M%S"); std::string timestamp = ss.str() + "_"; - DotFileWriter writer; writer.write_colored_graph(timestamp + "iso_group_rep_" + std::to_string(group_idx) + ".dot", rep_dag, colors); } - const bool max_bsp = use_max_bsp && (representative_instance.getComputationalDag().num_edges() == 0) && (representative_instance.getComputationalDag().vertex_type(0) == 0); // Build data structures for applying the pattern --- @@ -491,10 +479,9 @@ class IsomorphicSubgraphScheduler { for (vertex_idx_t j = 0; j < static_cast>(rep_subgraph_vertices_sorted.size()); ++j) { auto sp_pair = std::make_pair(bsp_schedule.assignedSuperstep(j), bsp_schedule.assignedProcessor(j)); - if (max_bsp) + if (max_bsp) sp_pair = std::make_pair(j, 0); - if (sp_proc_to_relative_partition.find(sp_pair) == 
sp_proc_to_relative_partition.end()) { sp_proc_to_relative_partition[sp_pair] = num_partitions_per_subgraph++; } @@ -516,12 +503,12 @@ class IsomorphicSubgraphScheduler { } else { // For other subgraphs, build the isomorphic mapping Constr_Graph_t current_subgraph_graph; create_induced_subgraph(instance.getComputationalDag(), current_subgraph_graph, current_subgraph_vertices_sorted); - + MerkleHashComputer current_hasher(current_subgraph_graph); - for(const auto& [hash, rep_orbit_nodes] : rep_hasher.get_orbits()) { - const auto& current_orbit_nodes = current_hasher.get_orbit_from_hash(hash); - for(size_t k = 0; k < rep_orbit_nodes.size(); ++k) { + for (const auto &[hash, rep_orbit_nodes] : rep_hasher.get_orbits()) { + const auto &current_orbit_nodes = current_hasher.get_orbit_from_hash(hash); + for (size_t k = 0; k < rep_orbit_nodes.size(); ++k) { // Map: current_subgraph_vertex -> representative_subgraph_local_idx current_vertex_to_rep_local_idx[current_subgraph_vertices_sorted[current_orbit_nodes[k]]] = static_cast>(rep_orbit_nodes[k]); } @@ -529,11 +516,11 @@ class IsomorphicSubgraphScheduler { } // Apply the partition pattern - for (const auto& current_vertex : current_subgraph_vertices_sorted) { + for (const auto &current_vertex : current_subgraph_vertices_sorted) { const auto rep_local_idx = current_vertex_to_rep_local_idx.at(current_vertex); auto sp_pair = std::make_pair(bsp_schedule.assignedSuperstep(rep_local_idx), bsp_schedule.assignedProcessor(rep_local_idx)); - if (max_bsp) + if (max_bsp) sp_pair = std::make_pair(rep_local_idx, 0); partition[current_vertex] = current_partition_idx + sp_proc_to_relative_partition.at(sp_pair); @@ -544,4 +531,4 @@ class IsomorphicSubgraphScheduler { } }; -} \ No newline at end of file +} // namespace osp \ No newline at end of file diff --git a/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp b/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp index 0b125e71..97fa53a5 100644 ---
a/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp +++ b/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp @@ -19,8 +19,8 @@ limitations under the License. #pragma once #include "osp/bsp/scheduler/Scheduler.hpp" -#include "osp/graph_algorithms/subgraph_algorithms.hpp" #include "osp/graph_algorithms/computational_dag_util.hpp" +#include "osp/graph_algorithms/subgraph_algorithms.hpp" #include #include @@ -35,7 +35,7 @@ namespace osp { * potentially disconnected, subgraph that resulted from merging smaller isomorphic subgraphs. It divides * the input graph into its weakly connected components and schedules them on proportionally allocated processors. */ -template +template class TrimmedGroupScheduler : public Scheduler { Scheduler *sub_scheduler; @@ -94,7 +94,7 @@ class TrimmedGroupScheduler : public Scheduler { // Determine the processor allocation for a single sub-problem. // Calculate offsets for processor types within the main 'arch' (passed to TrimmedGroupScheduler) std::vector arch_proc_type_offsets(arch.getNumberOfProcessorTypes(), 0); - const auto& arch_proc_type_counts = arch.getProcessorTypeCount(); + const auto &arch_proc_type_counts = arch.getProcessorTypeCount(); for (unsigned type_idx = 1; type_idx < arch.getNumberOfProcessorTypes(); ++type_idx) { arch_proc_type_offsets[type_idx] = arch_proc_type_offsets[type_idx - 1] + arch_proc_type_counts[type_idx - 1]; } @@ -115,12 +115,12 @@ class TrimmedGroupScheduler : public Scheduler { } // Create the sub-architecture for one sub-problem. 
- BspArchitecture sub_arch(arch); - sub_arch.set_processors_consequ_types(sub_proc_counts, mem_weights); + BspArchitecture sub_arch(arch); + sub_arch.SetProcessorsConsequTypes(sub_proc_counts, mem_weights); // Calculate offsets for processor types within the 'sub_arch' std::vector sub_arch_proc_type_offsets(sub_arch.getNumberOfProcessorTypes(), 0); - const auto& sub_arch_proc_type_counts = sub_arch.getProcessorTypeCount(); + const auto &sub_arch_proc_type_counts = sub_arch.getProcessorTypeCount(); for (unsigned type_idx = 1; type_idx < sub_arch.getNumberOfProcessorTypes(); ++type_idx) { sub_arch_proc_type_offsets[type_idx] = sub_arch_proc_type_offsets[type_idx - 1] + sub_arch_proc_type_counts[type_idx - 1]; } @@ -135,8 +135,8 @@ class TrimmedGroupScheduler : public Scheduler { std::sort(group_vertices.begin(), group_vertices.end()); BspInstance sub_instanc; - sub_instanc.setArchitecture(sub_arch); // Set the sub-architecture - sub_instanc.setNodeProcessorCompatibility(instance.getNodeProcessorCompatibilityMatrix()); // Inherit compatibility + sub_instanc.getArchitecture() = sub_arch; + sub_instanc.setNodeProcessorCompatibility(instance.getNodeProcessorCompatibilityMatrix()); // Inherit compatibility auto global_to_local_map = create_induced_subgraph_map(dag, sub_instanc.getComputationalDag(), group_vertices); // Create induced subgraph // Create a schedule object for the sub-problem @@ -144,10 +144,11 @@ class TrimmedGroupScheduler : public Scheduler { // Call the sub-scheduler to compute the schedule for this group of components auto status = sub_scheduler->computeSchedule(sub_schedule); - if (status != RETURN_STATUS::OSP_SUCCESS && status != RETURN_STATUS::BEST_FOUND) return status; + if (status != RETURN_STATUS::OSP_SUCCESS && status != RETURN_STATUS::BEST_FOUND) + return status; // Map the sub-schedule back to the main schedule. 
- for (const auto& v_global : group_vertices) { + for (const auto &v_global : group_vertices) { const auto v_local = global_to_local_map.at(v_global); const unsigned sub_proc = sub_schedule.assignedProcessor(v_local); const unsigned sub_superstep = sub_schedule.assignedSuperstep(v_local); diff --git a/include/osp/graph_algorithms/computational_dag_construction_util.hpp b/include/osp/graph_algorithms/computational_dag_construction_util.hpp index e85217e9..553996a6 100644 --- a/include/osp/graph_algorithms/computational_dag_construction_util.hpp +++ b/include/osp/graph_algorithms/computational_dag_construction_util.hpp @@ -34,7 +34,7 @@ namespace osp { * @tparam Graph_to The type of the target graph. Must satisfy `is_constructable_cdag_vertex`. * @param from The source graph. * @param to The target graph. - */ + */ template void constructComputationalDag(const Graph_from &from, Graph_to &to) { static_assert(is_computational_dag_v, "Graph_from must satisfy the computational_dag concept"); @@ -46,21 +46,21 @@ void constructComputationalDag(const Graph_from &from, Graph_to &to) { for (const auto &v_idx : from.vertices()) { if constexpr (has_typed_vertices_v and has_typed_vertices_v) { vertex_map.push_back(to.add_vertex(from.vertex_work_weight(v_idx), from.vertex_comm_weight(v_idx), - from.vertex_mem_weight(v_idx), from.vertex_type(v_idx))); + from.vertex_mem_weight(v_idx), from.vertex_type(v_idx))); } else { vertex_map.push_back(to.add_vertex(from.vertex_work_weight(v_idx), from.vertex_comm_weight(v_idx), - from.vertex_mem_weight(v_idx))); + from.vertex_mem_weight(v_idx))); } } if constexpr (has_edge_weights_v and has_edge_weights_v) { for (const auto &e : edges(from)) { - to.add_edge(vertex_map.at(source(e, from)), vertex_map.at(target(e, from)), from.edge_comm_weight(e)); + to.add_edge(vertex_map[source(e, from)], vertex_map[target(e, from)], from.edge_comm_weight(e)); } } else { for (const auto &v : from.vertices()) { for (const auto &child : from.children(v)) { - 
to.add_edge(vertex_map.at(v), vertex_map.at(child)); + to.add_edge(vertex_map[v], vertex_map[child]); } } } diff --git a/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp b/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp index 0b67ab30..616aea6b 100644 --- a/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp +++ b/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp @@ -17,6 +17,8 @@ limitations under the License. */ #pragma once +#include // for std::size_t + namespace osp { /** @@ -71,17 +73,17 @@ struct cdag_vertex_impl { }; /** - * @brief A vertex implementation with integer weights. Indexed by size_t. Node types are unsigned. + * @brief A vertex implementation with integer weights. Indexed by std::size_t. Node types are unsigned. * * This struct implements a vertex with integer weights for work, communication, and memory. */ -using cdag_vertex_impl_int = cdag_vertex_impl; +using cdag_vertex_impl_int = cdag_vertex_impl; /** - * @brief A vertex implementation with unsigned weights. Indexed by size_t. Node types are unsigned. + * @brief A vertex implementation with unsigned weights. Indexed by std::size_t. Node types are unsigned. * * This struct implements a vertex with unsigned weights for work, communication, and memory. */ -using cdag_vertex_impl_unsigned = cdag_vertex_impl; +using cdag_vertex_impl_unsigned = cdag_vertex_impl; } // namespace osp \ No newline at end of file diff --git a/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp b/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp index 74340de6..0a1b676a 100644 --- a/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp +++ b/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp @@ -32,8 +32,8 @@ namespace osp { * @brief A vector-based implementation of a computational DAG. 
* * This class implements a computational DAG using adjacency lists stored in two std::vectors. - * It manages the storage of vertices and edges, and provides an interface to query and modify the graph. - * + * It manages the storage of vertices and edges, and provides an interface to query and modify the graph. + * * This class satisfies the following concepts: * - `is_computational_dag_typed_vertices` * - `is_directed_graph` @@ -78,9 +78,8 @@ class computational_dag_vector_impl { explicit computational_dag_vector_impl(const vertex_idx num_vertices) : vertices_(num_vertices), out_neigbors(num_vertices), in_neigbors(num_vertices), num_edges_(0), num_vertex_types_(0) { - for (vertex_idx i = 0; i < num_vertices; ++i) { - vertices_.at(i).id = i; + vertices_[i].id = i; } } @@ -98,9 +97,7 @@ class computational_dag_vector_impl { */ template explicit computational_dag_vector_impl(const Graph_t &other) { - static_assert(is_computational_dag_v, "Graph_t must satisfy the is_computation_dag concept"); - constructComputationalDag(other, *this); } @@ -150,40 +147,40 @@ class computational_dag_vector_impl { [[nodiscard]] vertex_idx num_edges() const { return num_edges_; } /** - * @brief Returns the parents (in-neighbors) of a vertex. + * @brief Returns the parents (in-neighbors) of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] const std::vector &parents(const vertex_idx v) const { return in_neigbors.at(v); } + [[nodiscard]] const std::vector &parents(const vertex_idx v) const { return in_neigbors[v]; } /** - * @brief Returns the children (out-neighbors) of a vertex. + * @brief Returns the children (out-neighbors) of a vertex. Does not perform bounds checking. * @param v The vertex index. 
*/ - [[nodiscard]] const std::vector &children(const vertex_idx v) const { return out_neigbors.at(v); } + [[nodiscard]] const std::vector &children(const vertex_idx v) const { return out_neigbors[v]; } /** - * @brief Returns the in-degree of a vertex. + * @brief Returns the in-degree of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast(in_neigbors.at(v).size()); } + [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast(in_neigbors[v].size()); } /** - * @brief Returns the out-degree of a vertex. + * @brief Returns the out-degree of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast(out_neigbors.at(v).size()); } + [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast(out_neigbors[v].size()); } - [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_.at(v).work_weight; } + [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_[v].work_weight; } - [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_.at(v).comm_weight; } + [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_[v].comm_weight; } - [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_.at(v).mem_weight; } + [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_[v].mem_weight; } - [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_.at(v).vertex_type; } + [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_[v].vertex_type; } [[nodiscard]] vertex_type_type num_vertex_types() const { return 
num_vertex_types_; } - [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_.at(v); } + [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_[v]; } /** * @brief Adds a new isolated vertex to the graph. @@ -196,7 +193,6 @@ class computational_dag_vector_impl { */ vertex_idx add_vertex(const vertex_work_weight_type work_weight, const vertex_comm_weight_type comm_weight, const vertex_mem_weight_type mem_weight, const vertex_type_type vertex_type = 0) { - vertices_.emplace_back(vertices_.size(), work_weight, comm_weight, mem_weight, vertex_type); out_neigbors.push_back({}); in_neigbors.push_back({}); @@ -231,7 +227,6 @@ class computational_dag_vector_impl { * @return True if the edge was added, false if it already exists or vertices are invalid. */ bool add_edge(const vertex_idx source, const vertex_idx target) { - if (source >= static_cast(vertices_.size()) || target >= static_cast(vertices_.size()) || source == target) return false; @@ -240,7 +235,7 @@ class computational_dag_vector_impl { return false; } - out_neigbors.at(source).push_back(target); + out_neigbors[source].push_back(target); in_neigbors.at(target).push_back(source); num_edges_++; @@ -267,7 +262,6 @@ using computational_dag_vector_impl_def_t = computational_dag_vector_impl; - static_assert(is_directed_graph_edge_desc_v>, "computational_dag_vector_impl must satisfy the directed_graph_edge_desc concept"); diff --git a/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp b/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp index 1deadcee..3ab94872 100644 --- a/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp +++ b/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp @@ -80,8 +80,8 @@ class dag_vector_adapter { dag_vector_adapter(const std::vector> &out_neigbors_, const std::vector> &in_neigbors_) : vertices_(out_neigbors_.size()), out_neigbors(&out_neigbors_), 
in_neigbors(&in_neigbors_), num_edges_(0), num_vertex_types_(1) { for (vertex_idx i = 0; i < static_cast(out_neigbors_.size()); ++i) { - vertices_.at(i).id = i; - num_edges_ += out_neigbors_.at(i).size(); + vertices_[i].id = i; + num_edges_ += out_neigbors_[i].size(); } } @@ -107,8 +107,8 @@ class dag_vector_adapter { num_edges_ = 0; for (vertex_idx i = 0; i < static_cast(out_neigbors->size()); ++i) { - vertices_.at(i).id = i; - num_edges_ += out_neigbors->at(i).size(); + vertices_[i].id = i; + num_edges_ += (*out_neigbors)[i].size(); } num_vertex_types_ = 1; @@ -130,40 +130,40 @@ class dag_vector_adapter { [[nodiscard]] vertex_idx num_edges() const { return static_cast(num_edges_); } /** - * @brief Returns a view of the parents (in-neighbors) of a vertex. + * @brief Returns a view of the parents (in-neighbors) of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] auto parents(const vertex_idx v) const { return vector_cast_view(in_neigbors->at(v)); } + [[nodiscard]] auto parents(const vertex_idx v) const { return vector_cast_view((*in_neigbors)[v]); } /** - * @brief Returns a view of the children (out-neighbors) of a vertex. + * @brief Returns a view of the children (out-neighbors) of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] auto children(const vertex_idx v) const { return vector_cast_view(out_neigbors->at(v)); } + [[nodiscard]] auto children(const vertex_idx v) const { return vector_cast_view((*out_neigbors)[v]); } /** - * @brief Returns the in-degree of a vertex. + * @brief Returns the in-degree of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast(in_neigbors->at(v).size()); } + [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast((*in_neigbors)[v].size()); } /** - * @brief Returns the out-degree of a vertex.
+ * @brief Returns the out-degree of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast(out_neigbors->at(v).size()); } + [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast((*out_neigbors)[v].size()); } - [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_.at(v).work_weight; } + [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_[v].work_weight; } - [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_.at(v).comm_weight; } + [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_[v].comm_weight; } - [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_.at(v).mem_weight; } + [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_[v].mem_weight; } - [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_.at(v).vertex_type; } + [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_[v].vertex_type; } [[nodiscard]] vertex_type_type num_vertex_types() const { return num_vertex_types_; } - [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_.at(v); } + [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_[v]; } void set_vertex_work_weight(const vertex_idx v, const vertex_work_weight_type work_weight) { vertices_.at(v).work_weight = work_weight; @@ -192,7 +192,6 @@ class dag_vector_adapter { unsigned num_vertex_types_ = 0; }; - static_assert(is_directed_graph_edge_desc_v>, "dag_vector_adapter must satisfy the directed_graph_edge_desc concept"); diff --git 
a/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp b/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp index e8fbe586..b42ea17d 100644 --- a/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp +++ b/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp @@ -141,7 +141,7 @@ class vector_cast_view { * @param i The index of the element to access. * @return The element at index i, cast to to_t. */ - [[nodiscard]] auto operator[](std::size_t i) const { return static_cast(vec.at(i)); } + [[nodiscard]] auto operator[](std::size_t i) const { return static_cast(vec[i]); } }; } // namespace osp \ No newline at end of file diff --git a/include/osp/partitioning/partitioners/partitioning_ILP.hpp b/include/osp/partitioning/partitioners/partitioning_ILP.hpp index 0482d936..2e6c4e0e 100644 --- a/include/osp/partitioning/partitioners/partitioning_ILP.hpp +++ b/include/osp/partitioning/partitioners/partitioning_ILP.hpp @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner +@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner */ #pragma once @@ -21,40 +21,39 @@ limitations under the License. 
#include #include -#include "osp/partitioning/partitioners/partitioning_ILP_base.hpp" +#include "osp/auxiliary/return_status.hpp" #include "osp/partitioning/model/partitioning.hpp" +#include "osp/partitioning/partitioners/partitioning_ILP_base.hpp" -namespace osp{ +namespace osp { template class HypergraphPartitioningILP : public HypergraphPartitioningILPBase { protected: - std::vector readCoptAssignment(const PartitioningProblem &instance, Model& model); + std::vector readCoptAssignment(const PartitioningProblem &instance, Model &model); - void setupExtraVariablesConstraints(const PartitioningProblem &instance, Model& model); + void setupExtraVariablesConstraints(const PartitioningProblem &instance, Model &model); - void setInitialSolution(const Partitioning &partition, Model& model); + void setInitialSolution(const Partitioning &partition, Model &model); public: - virtual ~HypergraphPartitioningILP() override = default; - RETURN_STATUS computePartitioning(Partitioning& result); + RETURN_STATUS computePartitioning(Partitioning &result); virtual std::string getAlgorithmName() const override { return "HypergraphPartitioningILP"; } }; template -RETURN_STATUS HypergraphPartitioningILP::computePartitioning(Partitioning& result) -{ +RETURN_STATUS HypergraphPartitioningILP::computePartitioning(Partitioning &result) { Envr env; Model model = env.CreateModel("HypergraphPart"); this->setupFundamentalVariablesConstraintsObjective(result.getInstance(), model); setupExtraVariablesConstraints(result.getInstance(), model); - if(this->use_initial_solution) + if (this->use_initial_solution) setInitialSolution(result, model); this->solveILP(model); @@ -82,7 +81,7 @@ RETURN_STATUS HypergraphPartitioningILP::computePartitioning(Parti } template -void HypergraphPartitioningILP::setupExtraVariablesConstraints(const PartitioningProblem &instance, Model& model) { +void HypergraphPartitioningILP::setupExtraVariablesConstraints(const PartitioningProblem &instance, Model &model) { using 
index_type = typename hypergraph_t::vertex_idx; @@ -104,19 +103,17 @@ void HypergraphPartitioningILP::setupExtraVariablesConstraints(con // hyperedge indicators match node variables for (unsigned part = 0; part < numberOfParts; part++) for (index_type node = 0; node < numberOfVertices; node++) - for (const index_type& hyperedge : instance.getHypergraph().get_incident_hyperedges(node)) + for (const index_type &hyperedge : instance.getHypergraph().get_incident_hyperedges(node)) model.AddConstr(this->hyperedge_uses_partition[hyperedge][static_cast(part)] >= this->node_in_partition[node][static_cast(part)]); - } // convert generic one-to-many assingment (of base class function) to one-to-one template -std::vector HypergraphPartitioningILP::readCoptAssignment(const PartitioningProblem &instance, Model& model) -{ +std::vector HypergraphPartitioningILP::readCoptAssignment(const PartitioningProblem &instance, Model &model) { using index_type = typename hypergraph_t::vertex_idx; std::vector node_to_partition(instance.getHypergraph().num_vertices(), std::numeric_limits::max()); - std::vector > assignmentsGenericForm = this->readAllCoptAssignments(instance, model); + std::vector> assignmentsGenericForm = this->readAllCoptAssignments(instance, model); for (index_type node = 0; node < instance.getHypergraph().num_vertices(); node++) node_to_partition[node] = assignmentsGenericForm[node].front(); @@ -125,21 +122,19 @@ std::vector HypergraphPartitioningILP::readCoptAssignmen } template -void HypergraphPartitioningILP::setInitialSolution(const Partitioning &partition, Model& model) -{ +void HypergraphPartitioningILP::setInitialSolution(const Partitioning &partition, Model &model) { using index_type = typename hypergraph_t::vertex_idx; - const std::vector& assignment = partition.assignedPartitions(); - const unsigned& numPartitions = partition.getInstance().getNumberOfPartitions(); - if(assignment.size() != partition.getInstance().getHypergraph().num_vertices()) + const std::vector 
&assignment = partition.assignedPartitions(); + const unsigned &numPartitions = partition.getInstance().getNumberOfPartitions(); + if (assignment.size() != partition.getInstance().getHypergraph().num_vertices()) return; - for(index_type node = 0; node < assignment.size(); ++node) - { - if(assignment[node] >= numPartitions) + for (index_type node = 0; node < assignment.size(); ++node) { + if (assignment[node] >= numPartitions) continue; - - for(unsigned part = 0; part < numPartitions; ++part) + + for (unsigned part = 0; part < numPartitions; ++part) model.SetMipStart(this->node_in_partition[node][static_cast(part)], static_cast(assignment[node] == part)); } model.LoadMipStart(); diff --git a/tests/bsp_architecture.cpp b/tests/bsp_architecture.cpp index af26e034..d803bb56 100644 --- a/tests/bsp_architecture.cpp +++ b/tests/bsp_architecture.cpp @@ -19,8 +19,8 @@ limitations under the License. #define BOOST_TEST_MODULE Bsp_Architecture #include -#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" #include "osp/bsp/model/BspArchitecture.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" using namespace osp; @@ -61,18 +61,18 @@ BOOST_AUTO_TEST_CASE(ParameterizedConstructorTest) { BOOST_CHECK_EQUAL(architecture.maxMemoryBoundProcType(0), 100); - BOOST_TEST(architecture.sendCostMatrix() == uniform_sent_costs); + BOOST_TEST(architecture.sendCost() == uniform_sent_costs); std::vector> expectedSendCosts = {{0, 2, 2, 2}, {2, 0, 2, 2}, {2, 2, 0, 2}, {2, 2, 2, 0}}; - architecture.setSendCosts(expectedSendCosts); - BOOST_TEST(architecture.sendCostMatrix() == expectedSendCosts); + architecture.SetSendCosts(expectedSendCosts); + BOOST_TEST(architecture.sendCost() == expectedSendCosts); BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 1), 4); BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 0), 0); architecture.SetUniformSendCost(); - BOOST_TEST(architecture.sendCostMatrix() == uniform_sent_costs); + 
BOOST_TEST(architecture.sendCost() == uniform_sent_costs); BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 1), 2); BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 0), 0); @@ -141,8 +141,7 @@ BOOST_AUTO_TEST_CASE(Architecture) { } // constructor - std::vector> send_costs = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, - {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; + std::vector> send_costs = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; BOOST_CHECK_THROW(BspArchitecture test31(7, 42942, 0, send_costs), std::invalid_argument); @@ -169,10 +168,8 @@ BOOST_AUTO_TEST_CASE(Architecture) { } // constructor - std::vector> send_costs2 = {{0, 1, 2, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, - {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; - std::vector> send_costs3 = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, - {3, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; + std::vector> send_costs2 = {{0, 1, 2, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; + std::vector> send_costs3 = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {3, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; BspArchitecture test4(6, 0, 4294965, send_costs2); BOOST_CHECK_EQUAL(test4.numberOfProcessors(), 6); diff --git a/tests/bsp_instance.cpp b/tests/bsp_instance.cpp index c2b0b02a..101e4b2f 100644 --- a/tests/bsp_instance.cpp +++ b/tests/bsp_instance.cpp @@ -19,12 +19,13 @@ limitations under the License. 
#define BOOST_TEST_MODULE Bsp_Architecture #include +#include "osp/auxiliary/io/arch_file_reader.hpp" +#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/bsp/model/BspInstance.hpp" #include "osp/bsp/model/BspSchedule.hpp" +#include "osp/bsp/model/util/CompatibleProcessorRange.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" -#include "osp/auxiliary/io/arch_file_reader.hpp" -#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include #include @@ -43,7 +44,7 @@ BOOST_AUTO_TEST_CASE(test_1) { BspArchitecture architecture_2(6, 3, 1); - instance.setArchitecture(architecture_2); + instance.getArchitecture() = architecture_2; BOOST_CHECK_EQUAL(instance.numberOfProcessors(), 6); BOOST_CHECK_EQUAL(instance.synchronisationCosts(), 1); @@ -84,8 +85,7 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { BOOST_CHECK_EQUAL(instance.isCompatible(0, 0), true); BOOST_CHECK_EQUAL(instance.isCompatible(1, 0), false); - - compatible_processor_range range(instance); + CompatibleProcessorRange range(instance); BOOST_CHECK_EQUAL(range.compatible_processors_type(0).size(), 3); BOOST_CHECK_EQUAL(range.compatible_processors_type(1).size(), 1); @@ -97,7 +97,6 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { } std::cout << std::endl; - std::cout << "Compatible processors type 1: " << std::endl; for (const auto &p : range.compatible_processors_type(1)) { @@ -105,7 +104,6 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { } std::cout << std::endl; - BOOST_CHECK_EQUAL(range.compatible_processors_vertex(0).size(), 1); BOOST_CHECK_EQUAL(range.compatible_processors_vertex(1).size(), 3); BOOST_CHECK_EQUAL(range.compatible_processors_vertex(2).size(), 3); diff --git a/tests/coarser.cpp b/tests/coarser.cpp index e4bd92c3..9c77703d 100644 --- a/tests/coarser.cpp +++ b/tests/coarser.cpp @@ -23,24 +23,24 @@ limitations under the License. 
#include #include +#include "osp/auxiliary/io/arch_file_reader.hpp" +#include "osp/auxiliary/io/general_file_reader.hpp" +#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/bsp/scheduler/CoarseAndSchedule.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp" #include "osp/coarser/BspScheduleCoarser.hpp" -#include "osp/coarser/coarser_util.hpp" -#include "osp/coarser/funnel/FunnelBfs.hpp" -#include "osp/coarser/hdagg/hdagg_coarser.hpp" #include "osp/coarser/Sarkar/Sarkar.hpp" #include "osp/coarser/Sarkar/SarkarMul.hpp" #include "osp/coarser/SquashA/SquashA.hpp" #include "osp/coarser/SquashA/SquashAMul.hpp" +#include "osp/coarser/coarser_util.hpp" +#include "osp/coarser/funnel/FunnelBfs.hpp" +#include "osp/coarser/hdagg/hdagg_coarser.hpp" #include "osp/coarser/top_order/top_order_coarser.hpp" -#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" -#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" #include "osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp" #include "osp/graph_implementations/adj_list_impl/compact_sparse_graph_edge_desc.hpp" -#include "osp/auxiliary/io/arch_file_reader.hpp" -#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" -#include "osp/auxiliary/io/general_file_reader.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" #include "test_graphs.hpp" using namespace osp; @@ -121,14 +121,15 @@ BOOST_AUTO_TEST_CASE(coarser_hdagg_test) { std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1); name_graph = name_graph.substr(0, name_graph.find_last_of(".")); - std::cout << std::endl << "Graph: " << name_graph << std::endl; + std::cout << std::endl + << "Graph: " << name_graph << std::endl; using graph_t = computational_dag_edge_idx_vector_impl_def_t; BspInstance 
instance; bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); bool status_architecture = file_reader::readBspArchitecture((cwd / "data/machine_params/p3.arch").string(), instance.getArchitecture()); @@ -140,7 +141,7 @@ BOOST_AUTO_TEST_CASE(coarser_hdagg_test) { } BspInstance coarse_instance; - coarse_instance.setArchitecture(instance.getArchitecture()); + coarse_instance.getArchitecture() = instance.getArchitecture(); std::vector> vertex_map; std::vector reverse_vertex_map; @@ -193,7 +194,8 @@ BOOST_AUTO_TEST_CASE(coarser_hdagg_test_diff_graph_impl) { std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1); name_graph = name_graph.substr(0, name_graph.find_last_of(".")); - std::cout << std::endl << "Graph: " << name_graph << std::endl; + std::cout << std::endl + << "Graph: " << name_graph << std::endl; using graph_t1 = computational_dag_edge_idx_vector_impl_def_t; using graph_t2 = computational_dag_vector_impl_def_t; @@ -201,7 +203,7 @@ BOOST_AUTO_TEST_CASE(coarser_hdagg_test_diff_graph_impl) { BspInstance instance; bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); bool status_architecture = file_reader::readBspArchitecture((cwd / "data/machine_params/p3.arch").string(), instance.getArchitecture()); @@ -214,7 +216,7 @@ BOOST_AUTO_TEST_CASE(coarser_hdagg_test_diff_graph_impl) { BspInstance coarse_instance; BspArchitecture architecture_t2(instance.getArchitecture()); - coarse_instance.setArchitecture(architecture_t2); + coarse_instance.getArchitecture() = architecture_t2; std::vector> vertex_map; std::vector reverse_vertex_map; @@ -265,14 +267,15 @@ BOOST_AUTO_TEST_CASE(coarser_bspschedule_test) { std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1); name_graph = name_graph.substr(0, name_graph.find_last_of(".")); - 
std::cout << std::endl << "Graph: " << name_graph << std::endl; + std::cout << std::endl + << "Graph: " << name_graph << std::endl; using graph_t = computational_dag_edge_idx_vector_impl_def_t; BspInstance instance; bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); bool status_architecture = file_reader::readBspArchitecture((cwd / "data/machine_params/p3.arch").string(), instance.getArchitecture()); @@ -284,7 +287,7 @@ BOOST_AUTO_TEST_CASE(coarser_bspschedule_test) { } BspInstance coarse_instance; - coarse_instance.setArchitecture(instance.getArchitecture()); + coarse_instance.getArchitecture() = instance.getArchitecture(); std::vector> vertex_map; std::vector reverse_vertex_map; @@ -345,12 +348,13 @@ void test_coarser_same_graph(Coarser &coarser) { std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1); name_graph = name_graph.substr(0, name_graph.find_last_of(".")); - std::cout << std::endl << "Graph: " << name_graph << std::endl; + std::cout << std::endl + << "Graph: " << name_graph << std::endl; BspInstance instance; bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); bool status_architecture = file_reader::readBspArchitecture((cwd / "data/machine_params/p3.arch").string(), instance.getArchitecture()); @@ -362,17 +366,15 @@ void test_coarser_same_graph(Coarser &coarser) { } BspInstance coarse_instance; - coarse_instance.setArchitecture(instance.getArchitecture()); + coarse_instance.getArchitecture() = instance.getArchitecture(); std::vector> vertex_map; std::vector reverse_vertex_map; GreedyBspScheduler scheduler; - bool coarse_success = coarser.coarsenDag(instance.getComputationalDag(), coarse_instance.getComputationalDag(), reverse_vertex_map); BOOST_CHECK(coarse_success); - vertex_map = 
coarser_util::invert_vertex_contraction_map(reverse_vertex_map); BOOST_CHECK(check_vertex_map(vertex_map, instance.getComputationalDag().num_vertices())); @@ -446,27 +448,20 @@ BOOST_AUTO_TEST_CASE(squashA_test) { SquashA coarser(params); test_coarser_same_graph(coarser); - - + params.mode = SquashAParams::Mode::TRIANGLES; params.use_structured_poset = true; params.use_top_poset = true; coarser.setParams(params); - + test_coarser_same_graph(coarser); params.use_top_poset = false; coarser.setParams(params); - + test_coarser_same_graph(coarser); } - - - - - - BOOST_AUTO_TEST_CASE(coarser_SquashA_test_diff_graph_impl_CSG) { // static_assert(std::is_base_of::value, "Class is not a scheduler!"); std::vector filenames_graph = tiny_spaa_graphs(); @@ -484,7 +479,8 @@ BOOST_AUTO_TEST_CASE(coarser_SquashA_test_diff_graph_impl_CSG) { std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1); name_graph = name_graph.substr(0, name_graph.find_last_of(".")); - std::cout << std::endl << "Graph: " << name_graph << std::endl; + std::cout << std::endl + << "Graph: " << name_graph << std::endl; using graph_t1 = computational_dag_edge_idx_vector_impl_def_t; using graph_t2 = CSG; @@ -492,7 +488,7 @@ BOOST_AUTO_TEST_CASE(coarser_SquashA_test_diff_graph_impl_CSG) { BspInstance instance; bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); bool status_architecture = file_reader::readBspArchitecture((cwd / "data/machine_params/p3.arch").string(), instance.getArchitecture()); @@ -505,7 +501,7 @@ BOOST_AUTO_TEST_CASE(coarser_SquashA_test_diff_graph_impl_CSG) { BspInstance coarse_instance; BspArchitecture architecture_t2(instance.getArchitecture()); - coarse_instance.setArchitecture(architecture_t2); + coarse_instance.getArchitecture() = architecture_t2; std::vector> vertex_map; std::vector reverse_vertex_map; @@ -560,7 +556,8 @@ 
BOOST_AUTO_TEST_CASE(coarser_SquashA_test_diff_graph_impl_CSGE) { std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1); name_graph = name_graph.substr(0, name_graph.find_last_of(".")); - std::cout << std::endl << "Graph: " << name_graph << std::endl; + std::cout << std::endl + << "Graph: " << name_graph << std::endl; using graph_t1 = computational_dag_edge_idx_vector_impl_def_t; using graph_t2 = CSGE; @@ -568,7 +565,7 @@ BOOST_AUTO_TEST_CASE(coarser_SquashA_test_diff_graph_impl_CSGE) { BspInstance instance; bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); bool status_architecture = file_reader::readBspArchitecture((cwd / "data/machine_params/p3.arch").string(), instance.getArchitecture()); @@ -581,7 +578,7 @@ BOOST_AUTO_TEST_CASE(coarser_SquashA_test_diff_graph_impl_CSGE) { BspInstance coarse_instance; BspArchitecture architecture_t2(instance.getArchitecture()); - coarse_instance.setArchitecture(architecture_t2); + coarse_instance.getArchitecture() = architecture_t2; std::vector> vertex_map; std::vector reverse_vertex_map; @@ -619,13 +616,6 @@ BOOST_AUTO_TEST_CASE(coarser_SquashA_test_diff_graph_impl_CSGE) { } } - - - - - - - BOOST_AUTO_TEST_CASE(Sarkar_test) { using graph_t = computational_dag_edge_idx_vector_impl_def_t; // using graph_t = computational_dag_vector_impl_def_t; @@ -639,58 +629,47 @@ BOOST_AUTO_TEST_CASE(Sarkar_test) { test_coarser_same_graph(coarser); - params.useTopPoset = false; coarser.setParameters(params); test_coarser_same_graph(coarser); - - + params.mode = SarkarParams::Mode::FAN_IN_FULL; coarser.setParameters(params); test_coarser_same_graph(coarser); - params.mode = SarkarParams::Mode::FAN_IN_PARTIAL; coarser.setParameters(params); test_coarser_same_graph(coarser); - params.mode = SarkarParams::Mode::FAN_OUT_FULL; coarser.setParameters(params); test_coarser_same_graph(coarser); - params.mode = 
SarkarParams::Mode::FAN_OUT_PARTIAL; coarser.setParameters(params); test_coarser_same_graph(coarser); - params.mode = SarkarParams::Mode::LEVEL_EVEN; coarser.setParameters(params); test_coarser_same_graph(coarser); - - + params.mode = SarkarParams::Mode::LEVEL_ODD; coarser.setParameters(params); test_coarser_same_graph(coarser); - params.mode = SarkarParams::Mode::FAN_IN_BUFFER; coarser.setParameters(params); test_coarser_same_graph(coarser); - params.mode = SarkarParams::Mode::FAN_OUT_BUFFER; coarser.setParameters(params); test_coarser_same_graph(coarser); - params.mode = SarkarParams::Mode::HOMOGENEOUS_BUFFER; coarser.setParameters(params); test_coarser_same_graph(coarser); } - BOOST_AUTO_TEST_CASE(SarkarML_test) { using graph_t = computational_dag_edge_idx_vector_impl_def_t; // using graph_t = computational_dag_vector_impl_def_t; @@ -723,6 +702,6 @@ BOOST_AUTO_TEST_CASE(SquashAML_test) { // using graph_t = computational_dag_vector_impl_def_t; SquashAMul coarser; - + test_coarser_same_graph(coarser); } \ No newline at end of file diff --git a/tests/debug_merkle_divider.cpp b/tests/debug_merkle_divider.cpp index bf3bd1b5..5763d840 100644 --- a/tests/debug_merkle_divider.cpp +++ b/tests/debug_merkle_divider.cpp @@ -16,24 +16,23 @@ limitations under the License. @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner */ -#include -#include "osp/auxiliary/io/dot_graph_file_reader.hpp" #include "osp/auxiliary/io/DotFileWriter.hpp" +#include "osp/auxiliary/io/dot_graph_file_reader.hpp" #include "osp/bsp/scheduler/GreedySchedulers/BspLocking.hpp" -#include "osp/bsp/scheduler/Serial.hpp" -#include "osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GreedyChildren.hpp" +#include "osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp" +#include "osp/bsp/scheduler/Serial.hpp" #include "osp/coarser/coarser_util.hpp" #include "osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" +#include using namespace osp; - template -void check_partition_type_homogeneity(const GraphT& dag, const std::vector>& partition) { +void check_partition_type_homogeneity(const GraphT &dag, const std::vector> &partition) { // Group partitions by their ID std::map, std::vector>> partitions; for (vertex_idx_t i = 0; i < dag.num_vertices(); ++i) { @@ -41,19 +40,20 @@ void check_partition_type_homogeneity(const GraphT& dag, const std::vector" << std::endl; return 1; @@ -76,15 +76,12 @@ int main(int argc, char* argv[]) { instance.getComputationalDag().set_vertex_comm_weight(v, static_cast>(instance.getComputationalDag().vertex_comm_weight(v) * 0.01)); } - // Set up architecture - instance.getArchitecture().set_processors_consequ_types({24,48},{100,100}); + instance.getArchitecture().SetProcessorsConsequTypes({24, 48}, {100, 100}); instance.setDiagonalCompatibilityMatrix(2); instance.setSynchronisationCosts(2000); instance.setCommunicationCosts(1); - - // Set up the scheduler GrowLocalAutoCores growlocal; BspLocking locking; @@ -95,9 +92,9 @@ int main(int argc, char* argv[]) { ComboScheduler 
growlocal_kl(growlocal, kl); ComboScheduler locking_kl(locking, kl); ComboScheduler children_kl(children, kl); - + GreedyMetaScheduler scheduler; - //scheduler.addScheduler(growlocal_kl); + // scheduler.addScheduler(growlocal_kl); scheduler.addScheduler(locking_kl); scheduler.addScheduler(children_kl); scheduler.addSerialScheduler(); @@ -120,7 +117,7 @@ int main(int argc, char* argv[]) { graph_t corase_graph; coarser_util::construct_coarse_dag(instance.getComputationalDag(), corase_graph, partition); bool acyc = is_acyclic(corase_graph); - std::cout << "Partition is " << (acyc ? "acyclic." : "not acyclic."); + std::cout << "Partition is " << (acyc ? "acyclic." : "not acyclic."); std::cout << "Partition computation finished." << std::endl; std::cout << "Generated " << std::set>(partition.begin(), partition.end()).size() << " partitions." << std::endl; diff --git a/tests/kl_bsp_improver_test.cpp b/tests/kl_bsp_improver_test.cpp index df3ac3f1..6e1611ec 100644 --- a/tests/kl_bsp_improver_test.cpp +++ b/tests/kl_bsp_improver_test.cpp @@ -152,7 +152,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { // std::vector> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { diff --git a/tests/kl_lambda.cpp b/tests/kl_lambda.cpp index a7f40cf4..31f86130 100644 --- a/tests/kl_lambda.cpp +++ b/tests/kl_lambda.cpp @@ -25,14 +25,14 @@ limitations under the License. 
#include "osp/bsp/scheduler/LocalSearch/KernighanLin/kl_total_comm.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin/kl_total_cut.hpp" +#include "osp/auxiliary/io/arch_file_reader.hpp" +#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp" -#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" -#include "osp/auxiliary/io/arch_file_reader.hpp" -#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" -#include "test_graphs.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "test_graphs.hpp" using namespace osp; @@ -56,37 +56,38 @@ void add_node_types(Graph_t &dag) { for (const auto &v : dag.vertices()) { dag.set_vertex_type(v, node_type++ % 2); - } + } } template -void check_equal_affinity_table(table_t & table_1, table_t & table_2, const std::set & nodes) { +void check_equal_affinity_table(table_t &table_1, table_t &table_2, const std::set &nodes) { - for ( auto i : nodes) { + for (auto i : nodes) { BOOST_CHECK_EQUAL(table_1[i].size(), table_2[i].size()); - if (table_1[i].size() != table_2[i].size()) continue; + if (table_1[i].size() != table_2[i].size()) + continue; for (size_t j = 0; j < table_1[i].size(); ++j) { BOOST_CHECK_EQUAL(table_1[i][j].size(), table_2[i][j].size()); - if (table_1[i][j].size() != table_2[i][j].size()) continue; + if (table_1[i][j].size() != table_2[i][j].size()) + continue; for (size_t k = 0; k < table_1[i][j].size(); ++k) { BOOST_CHECK(std::abs(table_1[i][j][k] - table_2[i][j][k]) < 0.000001); - if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) { - std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", 
table_2=" << table_2[i][j][k] << std::endl; - + if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) { + std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl; } } } } } -void check_equal_lambda_map(const std::vector> & map_1, const std::vector> & map_2) { +void check_equal_lambda_map(const std::vector> &map_1, const std::vector> &map_2) { BOOST_CHECK_EQUAL(map_1.size(), map_2.size()); if (map_1.size() != map_2.size()) return; for (size_t i = 0; i < map_1.size(); ++i) { - for (const auto & [key, value] : map_1[i]) { + for (const auto &[key, value] : map_1[i]) { BOOST_CHECK_EQUAL(value, map_2[i].at(key)); if (value != map_2[i].at(key)) { @@ -117,7 +118,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) { BspInstance instance; bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); instance.getArchitecture().setSynchronisationCosts(5); instance.getArchitecture().setCommunicationCosts(5); @@ -134,7 +135,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) { add_mem_weights(instance.getComputationalDag()); add_node_types(instance.getComputationalDag()); - instance.getArchitecture().setProcessorsWithTypes({0,0,1,1}); + instance.getArchitecture().setProcessorsWithTypes({0, 0, 1, 1}); instance.setDiagonalCompatibilityMatrix(2); @@ -147,18 +148,15 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) { BOOST_CHECK(schedule.satisfiesNodeTypeConstraints()); kl_total_lambda_comm_improver kl; - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); BOOST_CHECK(schedule.satisfiesNodeTypeConstraints()); - } } - - BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { std::vector 
filenames_graph = test_graphs(); @@ -180,7 +178,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { BspInstance instance; bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); instance.getArchitecture().setSynchronisationCosts(5); instance.getArchitecture().setCommunicationCosts(5); @@ -204,7 +202,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); kl_total_lambda_comm_improver kl; - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -252,11 +250,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); // schedule.updateNumberOfSupersteps(); - -// using cost_f = kl_hyper_total_comm_cost_function; + +// using cost_f = kl_hyper_total_comm_cost_function; // using kl_improver_test = kl_improver_test; // kl_improver_test kl; - + // kl.setup_schedule(schedule); // auto &kl_active_schedule = kl.get_active_schedule(); @@ -269,7 +267,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + // BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); // BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); @@ -369,7 +367,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // using graph = computational_dag_edge_idx_vector_impl_def_int_t; // using VertexType = graph::vertex_idx; // using kl_move = kl_move_struct; - // graph dag; @@ -401,11 +398,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // schedule.setAssignedSupersteps({0, 0, 
1, 1, 2, 2, 3, 3}); // schedule.updateNumberOfSupersteps(); - -// using cost_f = kl_hyper_total_comm_cost_function; + +// using cost_f = kl_hyper_total_comm_cost_function; // using kl_improver_test = kl_improver_test; // kl_improver_test kl; - + // kl.setup_schedule(schedule); // auto &kl_active_schedule = kl.get_active_schedule(); @@ -418,7 +415,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + // auto node_selection = kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7}); // std::set nodes_to_check = {0, 1, 2, 3, 4, 5, 6, 7}; @@ -533,11 +530,10 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_penalty_test) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_hyper_total_comm_cost_function; + using cost_f = kl_hyper_total_comm_cost_function; using kl_improver_test = kl_improver_test; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -550,48 +546,47 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_penalty_test) { BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); - auto node_selection = kl.insert_gain_heap_test_penalty({2,3}); + auto node_selection = kl.insert_gain_heap_test_penalty({2, 3}); auto recompute_max_gain = kl.run_inner_iteration_test(); // best move 3 - std::cout << "------------------------recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << 
"------------------------recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); // best move 0 - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); // best move 1 - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; - + BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - } BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { @@ -629,27 +624,27 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); schedule.updateNumberOfSupersteps(); - - using cost_f = kl_hyper_total_comm_cost_function; + + using cost_f = kl_hyper_total_comm_cost_function; using kl_improver_test = kl_improver_test; 
kl_improver_test kl; - + kl.setup_schedule(schedule); BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - auto node_selection = kl.insert_gain_heap_test_penalty({7}); + auto node_selection = kl.insert_gain_heap_test_penalty({7}); auto recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "-----------recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "-----------recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - - auto& lambda_map = kl.get_comm_cost_f().node_lambda_map; + + auto &lambda_map = kl.get_comm_cost_f().node_lambda_map; BOOST_CHECK(lambda_map.get_proc_entry(v1, 0) == 2); BOOST_CHECK(lambda_map.get_proc_entry(v1, 1) == 1); @@ -669,32 +664,31 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { BOOST_CHECK(lambda_map.has_no_proc_entry(v8, 0)); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = 
kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - } // BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) { @@ -708,7 +702,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // std::cout << cwd << std::endl; // } - // for (auto &filename_graph : filenames_graph) { // GreedyBspScheduler test_scheduler; // BspInstance instance; @@ -724,7 +717,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // {4,4,0,1}, // {4,4,1,0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { @@ -752,7 +745,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // auto status = kl.improveSchedule(schedule); // auto finish_time = std::chrono::high_resolution_clock::now(); // auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - + // std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalLambdaCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl; // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -763,18 +756,17 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // // start_time = std::chrono::high_resolution_clock::now(); // // status = kl_old.improve_schedule_test_2(schedule_2); // // finish_time = std::chrono::high_resolution_clock::now(); - + // // duration = std::chrono::duration_cast(finish_time - start_time).count(); // // std::cout << "kl old finished in " << duration << " seconds, costs: " << 
schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl; - + // // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); // // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); // } // } - // BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs_mt) { // std::vector filenames_graph = large_spaa_graphs(); // using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -786,7 +778,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // std::cout << cwd << std::endl; // } - // for (auto &filename_graph : filenames_graph) { // GreedyBspScheduler test_scheduler; // BspInstance instance; @@ -802,7 +793,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // {4,4,0,1}, // {4,4,1,0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { @@ -830,7 +821,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // auto status = kl.improveSchedule(schedule); // auto finish_time = std::chrono::high_resolution_clock::now(); // auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - + // std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalLambdaCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl; // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -841,11 +832,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // // start_time = std::chrono::high_resolution_clock::now(); // // status = kl_old.improve_schedule_test_2(schedule_2); // // finish_time = std::chrono::high_resolution_clock::now(); - + // // duration = std::chrono::duration_cast(finish_time - start_time).count(); // // std::cout << "kl old finished in " << duration << " seconds, 
costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl; - + // // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); // // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); diff --git a/tests/kl_total.cpp b/tests/kl_total.cpp index 5d3d1486..58421144 100644 --- a/tests/kl_total.cpp +++ b/tests/kl_total.cpp @@ -22,18 +22,17 @@ limitations under the License. #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp" +#include "osp/auxiliary/io/arch_file_reader.hpp" +#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp" -#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" -#include "osp/auxiliary/io/arch_file_reader.hpp" -#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" -#include "test_graphs.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "test_graphs.hpp" using namespace osp; - template void add_mem_weights(Graph_t &dag) { @@ -49,17 +48,16 @@ void add_mem_weights(Graph_t &dag) { } template -void check_equal_affinity_table(table_t & table_1, table_t & table_2, const std::set & nodes) { +void check_equal_affinity_table(table_t &table_1, table_t &table_2, const std::set &nodes) { BOOST_CHECK_EQUAL(table_1.size(), table_2.size()); - for ( auto i : nodes) { + for (auto i : nodes) { for (size_t j = 0; j < table_1[i].size(); ++j) { for (size_t k = 0; k < table_1[i][j].size(); ++k) { BOOST_CHECK(std::abs(table_1[i][j][k] - table_2[i][j][k]) < 0.000001); - if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) { - std::cout << "Mismatch at [" << 
i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl; - + if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) { + std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl; } } } @@ -102,16 +100,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_smoke_test) { schedule.updateNumberOfSupersteps(); - using kl_improver_t = kl_total_comm_improver; kl_improver_t kl; - - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); - } BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) { @@ -135,7 +130,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) { BspInstance instance; bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); instance.getArchitecture().setSynchronisationCosts(5); instance.getArchitecture().setCommunicationCosts(5); @@ -147,7 +142,6 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) { BOOST_CHECK(false); } - add_mem_weights(instance.getComputationalDag()); BspSchedule schedule(instance); @@ -158,7 +152,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) { BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); kl_total_comm_improver kl; - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -170,7 +164,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_superstep_removal_test) { using graph = computational_dag_edge_idx_vector_impl_def_int_t; using VertexType = graph::vertex_idx; - + graph dag; const VertexType v1 = dag.add_vertex(2, 9, 2); @@ -200,13 +194,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_superstep_removal_test) { // Create a schedule with an almost empty superstep (step 1) 
schedule.setAssignedProcessors({0, 0, 0, 0, 1, 1, 1, 1}); schedule.setAssignedSupersteps({0, 0, 0, 0, 1, 2, 2, 2}); - + schedule.updateNumberOfSupersteps(); unsigned original_steps = schedule.numberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + + using cost_f = kl_total_comm_cost_function; kl_improver kl; - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -250,11 +244,10 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; using kl_improver_test = kl_improver_test; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -267,13 +260,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); - auto node_selection = kl.insert_gain_heap_test_penalty({2,3}); + auto node_selection = kl.insert_gain_heap_test_penalty({2, 3}); - auto& affinity = kl.get_affinity_table(); + auto &affinity = kl.get_affinity_table(); BOOST_CHECK_CLOSE(affinity[v3][0][0], 5.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v3][0][1], 4.0, 0.00001); @@ -290,41 +283,40 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { BOOST_CHECK_CLOSE(affinity[v4][1][2], -3.5, 0.00001); auto recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "------------------------recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "------------------------recompute max_gain: { "; + for (const auto &[key, value] : 
recompute_max_gain) { std::cout << key << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - + recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - } BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_penalty_test) { @@ -363,55 +355,53 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_penalty_test) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; using kl_improver_test = kl_improver_test; kl_improver_test kl; - + kl.setup_schedule(schedule); - //auto &kl_active_schedule = kl.get_active_schedule(); + // auto &kl_active_schedule = kl.get_active_schedule(); BOOST_CHECK_CLOSE(51.5, 
kl.get_current_cost(), 0.00001); - auto node_selection = kl.insert_gain_heap_test_penalty({7}); + auto node_selection = kl.insert_gain_heap_test_penalty({7}); auto recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "-----------recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "-----------recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - + recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - } BOOST_AUTO_TEST_CASE(kl_improver_violation_handling_test) { @@ -450,16 +440,15 @@ 
BOOST_AUTO_TEST_CASE(kl_improver_violation_handling_test) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; kl_improver_test kl; - + kl.setup_schedule(schedule); kl.compute_violations_test(); BOOST_CHECK_EQUAL(kl.is_feasible(), false); - + kl_improver kl_improver; kl_improver.improveSchedule(schedule); @@ -502,10 +491,9 @@ BOOST_AUTO_TEST_CASE(kl_base_1) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -529,11 +517,11 @@ BOOST_AUTO_TEST_CASE(kl_base_1) { BOOST_CHECK_EQUAL(kl.is_feasible(), false); BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001); - kl_move move_2(v2, 3.0 + 4.5 - 4.0 , 0, 0, 1, 0); + kl_move move_2(v2, 3.0 + 4.5 - 4.0, 0, 0, 1, 0); kl.apply_move_test(move_2); - BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(0), 39.0); // 42-3 + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(0), 39.0); // 42-3 BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(0), 5.0); // 2+3 BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 1); BOOST_CHECK_EQUAL(kl.is_feasible(), false); @@ -541,7 +529,7 @@ BOOST_AUTO_TEST_CASE(kl_base_1) { kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7}); - auto& affinity = kl.get_affinity_table(); + auto &affinity = kl.get_affinity_table(); BOOST_CHECK_CLOSE(affinity[v1][0][1], 2.0 - 4.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][1][1], 0.0, 0.00001); @@ -598,10 +586,9 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -614,7 
+601,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001); BOOST_CHECK_EQUAL(kl.is_feasible(), true); @@ -636,7 +623,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { BOOST_CHECK_EQUAL(kl.is_feasible(), true); BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001); - kl_move move_2(v2, -1.0 - 8.5 , 1, 1, 0, 0); + kl_move move_2(v2, -1.0 - 8.5, 1, 1, 0, 0); kl.apply_move_test(move_2); @@ -652,7 +639,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { BOOST_CHECK_EQUAL(kl.is_feasible(), false); BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001); - kl_move move_x(v2, -2.0 + 8.5 , 0, 0, 1, 0); + kl_move move_x(v2, -2.0 + 8.5, 0, 0, 1, 0); kl.apply_move_test(move_x); @@ -670,14 +657,13 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7}); - auto& affinity = kl.get_affinity_table(); + auto &affinity = kl.get_affinity_table(); BOOST_CHECK_CLOSE(affinity[v1][0][1], -4.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][0][2], -2.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][1][1], 2.0, 0.00001); - BOOST_CHECK_CLOSE(affinity[v1][1][2], 0.0, 0.00001); - + BOOST_CHECK_CLOSE(affinity[v1][1][2], 0.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v2][0][1], 9.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v2][0][2], 11.5, 0.00001); @@ -719,7 +705,6 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { BOOST_CHECK_CLOSE(affinity[v7][1][0], 7.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v7][1][1], 8.0, 0.00001); - BOOST_CHECK_CLOSE(affinity[v8][0][0], 8.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v8][0][1], 8.5, 0.00001); @@ -763,10 +748,9 @@ 
BOOST_AUTO_TEST_CASE(kl_base_3) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -779,21 +763,19 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); kl.insert_gain_heap_test_penalty({0, 1, 2, 3, 4, 5, 6, 7}); - auto& affinity = kl.get_affinity_table(); - + auto &affinity = kl.get_affinity_table(); BOOST_CHECK_CLOSE(affinity[v1][0][1], 1.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][0][2], 3.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][1][1], 2.0, 0.00001); - BOOST_CHECK_CLOSE(affinity[v1][1][2], 16.5, 0.00001); - + BOOST_CHECK_CLOSE(affinity[v1][1][2], 16.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v2][0][1], 15, 0.00001); BOOST_CHECK_CLOSE(affinity[v2][0][2], 11.5, 0.00001); @@ -835,16 +817,13 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { BOOST_CHECK_CLOSE(affinity[v7][1][0], 7.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v7][1][1], 8.0, 0.00001); - BOOST_CHECK_CLOSE(affinity[v8][0][0], 14.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v8][0][1], 8.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v8][1][0], 8.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v8][1][1], 1.0, 0.00001); - } - // BOOST_AUTO_TEST_CASE(kl_improver_incremental_update_test) { // using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -881,12 +860,11 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); // schedule.updateNumberOfSupersteps(); - - -// using cost_f = kl_total_comm_cost_function; + +// using cost_f = 
kl_total_comm_cost_function; // using kl_improver_test = kl_improver_test; // kl_improver_test kl; - + // kl.setup_schedule(schedule); // auto node_selection = kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7}); @@ -974,7 +952,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // }; - // BOOST_AUTO_TEST_CASE(kl_total_comm_large_test_graphs) { // std::vector filenames_graph = large_spaa_graphs(); // using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -987,7 +964,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // std::cout << cwd << std::endl; // } - // for (auto &filename_graph : filenames_graph) { // GreedyBspScheduler test_scheduler; // BspInstance instance; @@ -1003,7 +979,7 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // {4,4,0,1}, // {4,4,1,0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { @@ -1031,9 +1007,9 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // auto start_time = std::chrono::high_resolution_clock::now(); // auto status = kl.improveSchedule(schedule); // auto finish_time = std::chrono::high_resolution_clock::now(); - + // auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - + // std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl; // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -1044,18 +1020,17 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // // start_time = std::chrono::high_resolution_clock::now(); // // status = kl_old.improve_schedule_test_2(schedule_2); // // finish_time = std::chrono::high_resolution_clock::now(); - + // // duration = std::chrono::duration_cast(finish_time - start_time).count(); // // std::cout << "kl old finished in " << duration << " seconds, costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< 
std::endl; - + // // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); // // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); // } // } - // BOOST_AUTO_TEST_CASE(kl_total_comm_large_test_graphs_mt) { // std::vector filenames_graph = large_spaa_graphs(); // using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -1068,7 +1043,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // std::cout << cwd << std::endl; // } - // for (auto &filename_graph : filenames_graph) { // GreedyBspScheduler test_scheduler; // BspInstance instance; @@ -1084,7 +1058,7 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // {4,4,0,1}, // {4,4,1,0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { @@ -1112,9 +1086,9 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // auto start_time = std::chrono::high_resolution_clock::now(); // auto status = kl.improveSchedule(schedule); // auto finish_time = std::chrono::high_resolution_clock::now(); - + // auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - + // std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl; // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -1125,11 +1099,11 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // // start_time = std::chrono::high_resolution_clock::now(); // // status = kl_old.improve_schedule_test_2(schedule_2); // // finish_time = std::chrono::high_resolution_clock::now(); - + // // duration = std::chrono::duration_cast(finish_time - start_time).count(); // // std::cout << "kl old finished in " << duration << " seconds, costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl; - + // // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS 
|| status == RETURN_STATUS::BEST_FOUND); // // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); diff --git a/tests/trimmed_group_scheduler.cpp b/tests/trimmed_group_scheduler.cpp index 52cf4cdb..ccbfee8a 100644 --- a/tests/trimmed_group_scheduler.cpp +++ b/tests/trimmed_group_scheduler.cpp @@ -19,10 +19,10 @@ limitations under the License. #define BOOST_TEST_MODULE TrimmedGroupSchedulerTest #include -#include "osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp" #include "osp/bsp/model/BspInstance.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/bsp/scheduler/Scheduler.hpp" +#include "osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" using namespace osp; @@ -30,9 +30,9 @@ using namespace osp; using graph_t = computational_dag_vector_impl_def_t; // Mock SubScheduler for TrimmedGroupScheduler tests -template +template class MockSubScheduler : public Scheduler { -public: + public: // This mock scheduler assigns all nodes to local processor 0 and superstep 0. // This simplifies verification of the TrimmedGroupScheduler's mapping logic. 
RETURN_STATUS computeSchedule(BspSchedule &schedule) override { @@ -66,7 +66,7 @@ BOOST_FIXTURE_TEST_SUITE(TrimmedGroupSchedulerTestSuite, TrimmedGroupSchedulerFi BOOST_AUTO_TEST_CASE(EmptyGraphTest) { // Graph is empty by default arch.setNumberOfProcessors(4); - instance.setArchitecture(arch); + instance.getArchitecture() = arch; TrimmedGroupScheduler scheduler(mock_sub_scheduler, 1); BspSchedule schedule(instance); @@ -87,7 +87,7 @@ BOOST_AUTO_TEST_CASE(SingleComponentSingleProcessorTypeTest) { // Architecture: 4 processors of type 0 arch.setProcessorsWithTypes({0, 0, 0, 0}); - instance.setArchitecture(arch); + instance.getArchitecture() = arch; // min_non_zero_procs_ = 1 (all 4 processors assigned to this single component group) TrimmedGroupScheduler scheduler(mock_sub_scheduler, 1); @@ -119,7 +119,7 @@ BOOST_AUTO_TEST_CASE(MultipleComponentsSingleProcessorTypeEvenDistributionTest) // Architecture: 4 processors of type 0 arch.setProcessorsWithTypes({0, 0, 0, 0}); - instance.setArchitecture(arch); + instance.getArchitecture() = arch; // min_non_zero_procs_ = 2 (2 component groups, each gets 2 processors) TrimmedGroupScheduler scheduler(mock_sub_scheduler, 2); @@ -154,7 +154,7 @@ BOOST_AUTO_TEST_CASE(MultipleComponentsSingleProcessorTypeUnevenDistributionTest // Architecture: 6 processors of type 0 arch.setProcessorsWithTypes({0, 0, 0, 0, 0, 0}); - instance.setArchitecture(arch); + instance.getArchitecture() = arch; // min_non_zero_procs_ = 2 (3 components, 2 groups) // base_count = 3 / 2 = 1, remainder = 3 % 2 = 1 @@ -190,7 +190,7 @@ BOOST_AUTO_TEST_CASE(MultipleComponentsHeterogeneousArchitectureTest) { // Architecture: 2 processors of type 0 (global 0,1), 2 processors of type 1 (global 2,3) arch.setProcessorsWithTypes({0, 0, 1, 1}); - instance.setArchitecture(arch); + instance.getArchitecture() = arch; instance.setDiagonalCompatibilityMatrix(2); // Node type 0 compatible with proc type 0, etc. // min_non_zero_procs_ = 2 (2 components, 2 groups)