diff --git a/include/osp/auxiliary/io/arch_file_reader.hpp b/include/osp/auxiliary/io/arch_file_reader.hpp index 4e100ba8..71b0f006 100644 --- a/include/osp/auxiliary/io/arch_file_reader.hpp +++ b/include/osp/auxiliary/io/arch_file_reader.hpp @@ -18,10 +18,10 @@ limitations under the License. #pragma once +#include "osp/bsp/model/BspArchitecture.hpp" #include #include #include -#include "osp/bsp/model/BspArchitecture.hpp" namespace osp { namespace file_reader { @@ -31,7 +31,8 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture &archit // Skip comment lines while (std::getline(infile, line)) { - if (!line.empty() && line[0] != '%') break; + if (!line.empty() && line[0] != '%') + break; } // Parse architecture parameters @@ -58,24 +59,24 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture &archit if (0 <= mem_type && mem_type <= 3) { using memw_t = v_memw_t; switch (mem_type) { - case 0: - architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::NONE); - break; - case 1: - architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::LOCAL); - architecture.setMemoryBound(static_cast(M)); - break; - case 2: - architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::GLOBAL); - architecture.setMemoryBound(static_cast(M)); - break; - case 3: - architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT); - architecture.setMemoryBound(static_cast(M)); - break; - default: - std::cerr << "Invalid memory type.\n"; - return false; + case 0: + architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::NONE); + break; + case 1: + architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::LOCAL); + architecture.setMemoryBound(static_cast(M)); + break; + case 2: + architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::GLOBAL); + architecture.setMemoryBound(static_cast(M)); + break; + case 3: + architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT); + 
architecture.setMemoryBound(static_cast(M)); + break; + default: + std::cerr << "Invalid memory type.\n"; + return false; } } else if (mem_type == -1) { std::cout << "No memory type specified. Assuming \"NONE\".\n"; @@ -116,7 +117,7 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture &archit return false; } - architecture.setSendCosts(fromProc, toProc, static_cast>(value)); + architecture.SetSendCosts(fromProc, toProc, static_cast>(value)); } // Ensure there are no remaining non-comment lines @@ -127,7 +128,6 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture &archit } } - architecture.computeCommAverage(); return true; } diff --git a/include/osp/bsp/model/BspArchitecture.hpp b/include/osp/bsp/model/BspArchitecture.hpp index 8ac1c0a8..74872aae 100644 --- a/include/osp/bsp/model/BspArchitecture.hpp +++ b/include/osp/bsp/model/BspArchitecture.hpp @@ -34,46 +34,51 @@ limitations under the License. namespace osp { -static constexpr unsigned CacheLineSize = 64; - +/** + * @enum MEMORY_CONSTRAINT_TYPE + * @brief Enumerates the different types of memory constraints. + * Memory bounds are set per processor and apply to aggregated memory weights of nodes according to the different types of memory constraints. + */ enum class MEMORY_CONSTRAINT_TYPE { - NONE, - LOCAL, - GLOBAL, - PERSISTENT_AND_TRANSIENT, - LOCAL_IN_OUT, - LOCAL_INC_EDGES, - LOCAL_SOURCES_INC_EDGES + NONE, /**< No memory constraints. */ + LOCAL, /**< The memory bounds apply to the sum of memory weights of nodes assigned to the same processor and superstep. */ + GLOBAL, /**< The memory bounds apply to the sum of memory weights of the nodes assigned to the same processor. */ + PERSISTENT_AND_TRANSIENT, /**< Memory bounds apply to the sum of memory weights of nodes assigned to the same processor plus the maximum communication weight of a node assigned to a processor. */ + LOCAL_IN_OUT, /**< Memory constraints are local in-out. Experimental.
*/ + LOCAL_INC_EDGES, /**< Memory constraints are local incident edges. Experimental. */ + LOCAL_SOURCES_INC_EDGES /**< Memory constraints are local source incident edges. Experimental. */ }; -inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) { +/** + * @brief Converts the enum to a string literal. + * Returns const char* to avoid std::string allocation overhead. + */ +inline const char *to_string(MEMORY_CONSTRAINT_TYPE type) { switch (type) { case MEMORY_CONSTRAINT_TYPE::NONE: - os << "NONE"; - break; + return "NONE"; case MEMORY_CONSTRAINT_TYPE::LOCAL: - os << "LOCAL"; - break; + return "LOCAL"; case MEMORY_CONSTRAINT_TYPE::GLOBAL: - os << "GLOBAL"; - break; + return "GLOBAL"; case MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT: - os << "PERSISTENT_AND_TRANSIENT"; - break; + return "PERSISTENT_AND_TRANSIENT"; case MEMORY_CONSTRAINT_TYPE::LOCAL_IN_OUT: - os << "LOCAL_IN_OUT"; - break; + return "LOCAL_IN_OUT"; case MEMORY_CONSTRAINT_TYPE::LOCAL_INC_EDGES: - os << "LOCAL_INC_EDGES"; - break; + return "LOCAL_INC_EDGES"; case MEMORY_CONSTRAINT_TYPE::LOCAL_SOURCES_INC_EDGES: - os << "LOCAL_SOURCES_INC_EDGES"; - break; + return "LOCAL_SOURCES_INC_EDGES"; default: - os << "UNKNOWN"; - break; + return "UNKNOWN"; } - return os; +} + +/** + * @brief Stream operator overload using the helper function. + */ +inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) { + return os << to_string(type); } /** @@ -81,8 +86,28 @@ inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) { * @brief Represents the architecture of a BSP (Bulk Synchronous Parallel) system. * * The BspArchitecture class stores information about the number of processors, communication costs, - * synchronization costs, and send costs between processors in a BSP system. It provides methods to - * set and retrieve these values.
+ * synchronization costs, the send costs between processors, the types of processors, and the memory + * bounds. It provides methods to set and retrieve these values. + * + * **Processors:** + * The architecture consists of p processors, indexed from 0 to p-1. + * + * **Processor Types:** + * Processors can have different types, which are represented by non-negative integers. + * Processor types are assumed to be consecutive integers starting from 0. + * + * **Communication and Synchronization Costs:** + * - Communication Cost (g): The cost of communicating a unit of data between processors, i.e., the inverse of the bandwidth. + * - Synchronization Cost (L): The cost of synchronizing all processors at the end of a superstep. + * + * **Send Costs (NUMA):** + * The architecture supports Non-Uniform Memory Access (NUMA) effects via a send cost matrix. + * The cost to send data from processor i to processor j is given by g * sendCosts[i][j]. + * By default, send costs are uniform (1 for distinct processors, 0 for self). + * + * **Memory Constraints:** + * Each processor has a memory bound. The `MEMORY_CONSTRAINT_TYPE` determines how these bounds are applied + * (e.g., local per superstep, global per processor). */ template class BspArchitecture { @@ -90,84 +115,128 @@ class BspArchitecture { static_assert(is_computational_dag_v, "BspSchedule can only be used with computational DAGs."); private: - unsigned number_processors; - unsigned number_of_processor_types; + /** @brief The number of processors in the architecture. Must be at least 1. */ + unsigned numberOfProcessors_; + + /** @brief The number of processor types in the architecture. See processorTypes_ for more details. */ + unsigned numberOfProcessorTypes_; + + /** @brief The communication costs, typically denoted 'g' for the BSP model. */ + v_commw_t communicationCosts_; - v_commw_t communication_costs; - v_commw_t synchronisation_costs; + /** @brief The synchronisation costs, typically denoted 'L' for the BSP model.
*/ + v_commw_t synchronisationCosts_; - std::vector> memory_bound; + /** @brief The architecture allows memory bounds to be specified per processor. */ + std::vector> memoryBound_; - bool isNuma; + /** @brief Flag to indicate whether the architecture is NUMA, i.e., whether the send costs are different for different pairs of processors. */ + bool isNuma_; - std::vector processor_type; + /** @brief The architecture allows processor types to be specified. Processor types are used to express compatibilities, which can be specified in the BspInstance, regarding node types. */ + std::vector processorTypes_; - std::vector>> send_costs; + /** @brief A flattened p x p matrix of send costs. Access via index [i * numberOfProcessors_ + j]. */ + std::vector> sendCosts_; - MEMORY_CONSTRAINT_TYPE memory_const_type = MEMORY_CONSTRAINT_TYPE::NONE; + /** @brief The memory constraint type. */ + MEMORY_CONSTRAINT_TYPE memoryConstraintType_ = MEMORY_CONSTRAINT_TYPE::NONE; - bool are_send_cost_numa() { - if (number_processors == 1) + /** @brief Helper function to calculate the index of a flattened p x p matrix.
*/ + std::size_t FlatIndex(const unsigned row, const unsigned col) const { + return static_cast(row) * numberOfProcessors_ + col; + } + + bool AreSendCostsNuma() { + if (numberOfProcessors_ == 1U) return false; - v_commw_t val = send_costs[0][1]; - for (unsigned p1 = 0; p1 < number_processors; p1++) { - for (unsigned p2 = 0; p2 < number_processors; p2++) { + const v_commw_t val = sendCosts_[1U]; + for (unsigned p1 = 0U; p1 < numberOfProcessors_; p1++) { + for (unsigned p2 = 0U; p2 < numberOfProcessors_; p2++) { if (p1 == p2) continue; - if (send_costs[p1][p2] != val) + if (sendCosts_[FlatIndex(p1, p2)] != val) return true; } } return false; } + void UpdateNumberOfProcessorTypes() { + numberOfProcessorTypes_ = 0U; + for (unsigned p = 0U; p < numberOfProcessors_; p++) { + if (processorTypes_[p] >= numberOfProcessorTypes_) { + numberOfProcessorTypes_ = processorTypes_[p] + 1U; + } + } + } + + void SetSendCostDiagonalToZero() { + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + sendCosts_[FlatIndex(i, i)] = 0U; + } + } + + void InitializeUniformSendCosts() { + sendCosts_.assign(numberOfProcessors_ * numberOfProcessors_, 1U); + SetSendCostDiagonalToZero(); + isNuma_ = false; + } + public: + /** + * @brief Default constructor. + * Initializes a BSP architecture with 2 processors, 1 processor type, + * communication costs of 1, synchronisation costs of 2, memory bounds of 100, + * and send costs of 1 between all processors. 
+ */ BspArchitecture() - : number_processors(2), number_of_processor_types(1), communication_costs(1), synchronisation_costs(2), - memory_bound(std::vector>(number_processors, 100)), isNuma(false), - processor_type(std::vector(number_processors, 0)), - send_costs(std::vector>>( - number_processors, std::vector>(number_processors, 1))) { - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; - } + : numberOfProcessors_(2U), numberOfProcessorTypes_(1U), communicationCosts_(1U), synchronisationCosts_(2U), + memoryBound_(numberOfProcessors_, 100U), isNuma_(false), + processorTypes_(numberOfProcessors_, 0U), sendCosts_(numberOfProcessors_ * numberOfProcessors_, 1U) { + SetSendCostDiagonalToZero(); } BspArchitecture(const BspArchitecture &other) = default; - BspArchitecture(BspArchitecture &&other) = default; + BspArchitecture(BspArchitecture &&other) noexcept = default; BspArchitecture &operator=(const BspArchitecture &other) = default; - BspArchitecture &operator=(BspArchitecture &&other) = default; - ~BspArchitecture() = default; + BspArchitecture &operator=(BspArchitecture &&other) noexcept = default; + virtual ~BspArchitecture() = default; /** * @brief Constructs a BspArchitecture object with the specified number of processors, communication cost, and * synchronization cost. * - * @param processors The number of processors in the architecture. - * @param comm_cost The communication cost between processors. - * @param synch_cost The synchronization cost between processors. 
- */ - BspArchitecture(unsigned processors, v_commw_t comm_cost, v_commw_t synch_cost, - v_memw_t memory_bound_ = 100) - : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost), - synchronisation_costs(synch_cost), - memory_bound(std::vector>(number_processors, memory_bound_)), isNuma(false), - processor_type(std::vector(number_processors, 0)), - send_costs(std::vector>>( - number_processors, std::vector>(number_processors, 1))) { - - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; + * @param NumberOfProcessors The number of processors in the architecture. Must be greater than 0. + * @param CommunicationCost The communication cost between processors. + * @param SynchronisationCost The synchronization cost between processors. + * @param MemoryBound The memory bound for each processor (default: 100). + */ + BspArchitecture(const unsigned NumberOfProcessors, const v_commw_t CommunicationCost, const v_commw_t SynchronisationCost, + const v_memw_t MemoryBound = 100U) + : numberOfProcessors_(NumberOfProcessors), numberOfProcessorTypes_(1U), communicationCosts_(CommunicationCost), + synchronisationCosts_(SynchronisationCost), + memoryBound_(NumberOfProcessors, MemoryBound), isNuma_(false), + processorTypes_(NumberOfProcessors, 0U), sendCosts_(NumberOfProcessors * NumberOfProcessors, 1U) { + if (NumberOfProcessors == 0U) { + throw std::runtime_error("BspArchitecture: Number of processors must be greater than 0."); } + SetSendCostDiagonalToZero(); } + /** + * @brief Copy constructor from a BspArchitecture with a different graph type. + * + * @tparam Graph_t_other The graph type of the other BspArchitecture. + * @param other The other BspArchitecture object. 
+ */ template BspArchitecture(const BspArchitecture &other) - : number_processors(other.numberOfProcessors()), number_of_processor_types(other.getNumberOfProcessorTypes()), - communication_costs(other.communicationCosts()), synchronisation_costs(other.synchronisationCosts()), - memory_bound(other.memoryBound()), isNuma(other.isNumaArchitecture()), processor_type(other.processorTypes()), - send_costs(other.sendCosts()) { + : numberOfProcessors_(other.numberOfProcessors()), numberOfProcessorTypes_(other.getNumberOfProcessorTypes()), + communicationCosts_(other.communicationCosts()), synchronisationCosts_(other.synchronisationCosts()), + memoryBound_(other.memoryBound()), isNuma_(other.isNumaArchitecture()), processorTypes_(other.processorTypes()), + sendCosts_(other.sendCostsVector()) { static_assert(std::is_same_v, v_memw_t>, "BspArchitecture: Graph_t and Graph_t_other have the same memory weight type."); @@ -180,81 +249,87 @@ class BspArchitecture { } /** - * @brief Constructs a BspArchitecture object with the specified number of processors, communication cost, and - * synchronization cost. + * @brief Constructs a BspArchitecture object with custom send costs. * - * @param processors The number of processors in the architecture. - * @param comm_cost The communication cost between processors. - * @param synch_cost The synchronization cost between processors. - */ - BspArchitecture(unsigned int processors, v_commw_t comm_cost, v_commw_t synch_cost, - std::vector>> send_costs_) - : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost), - synchronisation_costs(synch_cost), memory_bound(std::vector>(number_processors, 100)), - processor_type(std::vector(number_processors, 0)), send_costs(send_costs_) { - - if (number_processors != send_costs.size()) { - throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n"); + * @param NumberOfProcessors The number of processors. Must be greater than 0. 
+ * @param CommunicationCost The communication cost. + * @param SynchronisationCost The synchronization cost. + * @param SendCosts The matrix of send costs between processors. Needs to be a processors x processors matrix. Diagonal entries are forced to zero. + */ + BspArchitecture(const unsigned NumberOfProcessors, const v_commw_t CommunicationCost, const v_commw_t SynchronisationCost, + const std::vector>> &SendCosts) + : numberOfProcessors_(NumberOfProcessors), numberOfProcessorTypes_(1U), communicationCosts_(CommunicationCost), + synchronisationCosts_(SynchronisationCost), memoryBound_(NumberOfProcessors, 100U), + processorTypes_(NumberOfProcessors, 0U) { + if (NumberOfProcessors == 0U) { + throw std::runtime_error("BspArchitecture: Number of processors must be greater than 0."); + } + if (NumberOfProcessors != SendCosts.size()) { + throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n"); } - if (std::any_of(send_costs.begin(), send_costs.end(), - [processors](const auto &thing) { return thing.size() != processors; })) { - throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n"); + if (std::any_of(SendCosts.begin(), SendCosts.end(), + [NumberOfProcessors](const auto &thing) { return thing.size() != NumberOfProcessors; })) { + throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n"); } - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; + sendCosts_.reserve(NumberOfProcessors * NumberOfProcessors); + for (const auto &row : SendCosts) { + sendCosts_.insert(sendCosts_.end(), row.begin(), row.end()); } - isNuma = are_send_cost_numa(); + SetSendCostDiagonalToZero(); + isNuma_ = AreSendCostsNuma(); } /** - * @brief Constructs a BspArchitecture object with the specified number of processors, communication cost, and - * synchronization cost. + * @brief Constructs a BspArchitecture object with custom send costs and memory bound. 
* - * @param processors The number of processors in the architecture. - * @param comm_cost The communication cost between processors. - * @param synch_cost The synchronization cost between processors. - */ - BspArchitecture(unsigned int processors, v_commw_t comm_cost, v_commw_t synch_cost, - v_memw_t memory_bound_, std::vector>> send_costs_) - : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost), - synchronisation_costs(synch_cost), - memory_bound(std::vector>(number_processors, memory_bound_)), - processor_type(std::vector(number_processors, 0)), send_costs(send_costs_) { - - if (number_processors != send_costs.size()) { - throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n"); + * @param NumberOfProcessors The number of processors. Must be greater than 0. + * @param CommunicationCost The communication cost. + * @param SynchronisationCost The synchronization cost. + * @param MemoryBound The memory bound for each processor. + * @param SendCosts The matrix of send costs between processors. Needs to be a processors x processors matrix. Diagonal entries are forced to zero. 
+ */ + BspArchitecture(const unsigned NumberOfProcessors, const v_commw_t CommunicationCost, const v_commw_t SynchronisationCost, + const v_memw_t MemoryBound, const std::vector>> &SendCosts) + : numberOfProcessors_(NumberOfProcessors), numberOfProcessorTypes_(1U), communicationCosts_(CommunicationCost), + synchronisationCosts_(SynchronisationCost), memoryBound_(NumberOfProcessors, MemoryBound), + processorTypes_(NumberOfProcessors, 0U) { + if (NumberOfProcessors == 0U) { + throw std::runtime_error("BspArchitecture: Number of processors must be greater than 0."); + } + if (NumberOfProcessors != SendCosts.size()) { + throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n"); } - if (std::any_of(send_costs.begin(), send_costs.end(), - [processors](const auto &thing) { return thing.size() != processors; })) { - throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n"); + if (std::any_of(SendCosts.begin(), SendCosts.end(), + [NumberOfProcessors](const auto &thing) { return thing.size() != NumberOfProcessors; })) { + throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n"); } - for (unsigned i = 0u; i < number_processors; i++) { - send_costs[i][i] = 0u; + sendCosts_.reserve(NumberOfProcessors * NumberOfProcessors); + for (const auto &row : SendCosts) { + sendCosts_.insert(sendCosts_.end(), row.begin(), row.end()); } - isNuma = are_send_cost_numa(); + SetSendCostDiagonalToZero(); + isNuma_ = AreSendCostsNuma(); } /** - * Sets the uniform send cost for each pair of processors in the BSP architecture. + * @brief Sets the uniform send cost for each pair of processors. * The send cost is set to 0 if the processors are the same, and 1 otherwise. - * This function assumes that the number of processors has already been set. 
*/ void SetUniformSendCost() { - - for (unsigned i = 0; i < number_processors; i++) { - for (unsigned j = 0; j < number_processors; j++) { + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + for (unsigned j = 0U; j < numberOfProcessors_; j++) { if (i == j) { - send_costs[i][j] = 0; + sendCosts_[FlatIndex(i, j)] = 0U; } else { - send_costs[i][j] = 1; + sendCosts_[FlatIndex(i, j)] = 1U; } } } - isNuma = false; + isNuma_ = false; } /** @@ -265,80 +340,59 @@ class BspArchitecture { * * @param base The base value used to calculate the send cost. */ - void SetExpSendCost(v_commw_t base) { - - isNuma = true; + void SetExpSendCost(const v_commw_t base) { + isNuma_ = true; unsigned maxPos = 1; constexpr unsigned two = 2; - for (; intpow(two, maxPos + 1) <= number_processors - 1; ++maxPos) { + for (; intpow(two, maxPos + 1) <= numberOfProcessors_ - 1; ++maxPos) { } - for (unsigned i = 0; i < number_processors; ++i) - for (unsigned j = i + 1; j < number_processors; ++j) - for (unsigned pos = maxPos; pos <= maxPos; --pos) - if (((1 << pos) & i) != ((1 << pos) & j)) { - send_costs[i][j] = send_costs[j][i] = intpow(base, pos); + + for (unsigned i = 0U; i < numberOfProcessors_; ++i) { + for (unsigned j = i + 1U; j < numberOfProcessors_; ++j) { + // Corrected loop to avoid underflow issues with unsigned + for (int pos = static_cast(maxPos); pos >= 0; --pos) { + if (((1U << pos) & i) != ((1U << pos) & j)) { + sendCosts_[FlatIndex(i, j)] = sendCosts_[FlatIndex(j, i)] = intpow(base, static_cast(pos)); break; } + } + } + } } - inline auto processors() const { return integral_range(number_processors); } - /** - * @brief Computes the average communication cost of the BspArchitecture. - * - * This function computes the average communication cost of the BspArchitecture object. - * The average communication cost is calculated as the sum of the send costs between processors divided by the - * number of processors. - * - * @return The average communication cost as an unsigned integer. 
+ * @brief Returns a view of processor indices from 0 to numberOfProcessors_ - 1. + * @return An integral view of processor indices. */ - v_commw_t computeCommAverage() const { - - double avg = 0; - for (unsigned i = 0; i < number_processors; ++i) - for (unsigned j = 0; j < number_processors; ++j) - avg += static_cast(send_costs[i][j]); - avg = avg * static_cast(communication_costs) / static_cast(number_processors) / static_cast(number_processors); - - if (avg > static_cast(std::numeric_limits::max())) { - throw std::invalid_argument("avg comm exceeds the limit (something is very wrong)"); - } - - return static_cast>(std::round(avg)); - } + [[nodiscard]] auto processors() const { return integral_range(numberOfProcessors_); } /** - * Sets the send costs for the BspArchitecture. + * @brief Sets the send costs for the BspArchitecture. * * @param vec A 2D vector representing the send costs between processors. - * The size of the vector must be equal to the number of processors. - * Each inner vector must also have a size equal to the number of processors. - * @throws std::invalid_argument if the size of the vector or inner vectors is invalid. + * @throws std::invalid_argument if the size of the vector is invalid or diagonal elements are not 0. 
*/ - void setSendCosts(const std::vector>> &vec) { - - if (vec.size() != number_processors) { - throw std::invalid_argument("Invalid Argument"); + void SetSendCosts(const std::vector>> &vec) { + if (vec.size() != numberOfProcessors_) { + throw std::invalid_argument("Invalid Argument: Vector size mismatch."); } - isNuma = false; - for (unsigned i = 0; i < number_processors; i++) { - - if (vec[i].size() != number_processors) { - throw std::invalid_argument("Invalid Argument"); + isNuma_ = false; + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + if (vec.at(i).size() != numberOfProcessors_) { + throw std::invalid_argument("Invalid Argument: Inner vector size mismatch."); } - for (unsigned j = 0; j < number_processors; j++) { - + for (unsigned j = 0U; j < numberOfProcessors_; j++) { if (i == j) { - if (vec[i][j] != 0) - throw std::invalid_argument("Invalid Argument, Diagonal elements should be 0"); + if (vec.at(i).at(j) != 0U) + throw std::invalid_argument("Invalid Argument: Diagonal elements should be 0."); } else { - send_costs[i][j] = vec[i][j]; + sendCosts_.at(FlatIndex(i, j)) = vec.at(i).at(j); - if (number_processors > 1 && vec[i][j] != vec[0][1]) { - isNuma = true; + if (numberOfProcessors_ > 1U && vec.at(i).at(j) != vec.at(0U).at(1U)) { + isNuma_ = true; } } } @@ -346,324 +400,309 @@ class BspArchitecture { } /** - * Sets the send costs between two processors. + * @brief Sets the send costs between two processors. * - * @param p1 The index of the first processor. - * @param p2 The index of the second processor. + * @param p1 The index of the first processor. Must be less than numberOfProcessors_. + * @param p2 The index of the second processor. Must be less than numberOfProcessors_. * @param cost The cost of sending data between the processors. - * - * @remarks If the two processors are the same, the send cost is not set. - * If the cost is not equal to 1, the architecture is considered NUMA. 
+ * @throws std::invalid_argument if the processor indices are out of bounds. */ - void setSendCosts(unsigned p1, unsigned p2, v_commw_t cost) { - - if (p1 >= number_processors || p2 > number_processors) - throw std::invalid_argument("Invalid Argument"); + void SetSendCosts(const unsigned p1, const unsigned p2, const v_commw_t cost) { + if (p1 >= numberOfProcessors_ || p2 >= numberOfProcessors_) // Fixed condition: p2 >= number_processors + throw std::invalid_argument("Invalid Argument: Processor index out of bounds."); if (p1 != p2) { - send_costs[p1][p2] = cost; - - isNuma = are_send_cost_numa(); + sendCosts_.at(FlatIndex(p1, p2)) = cost; + isNuma_ = AreSendCostsNuma(); } } /** - * Sets the memory bound for all processors of the BspArchitecture. - * - * @param memory_bound_ The new memory bound for all processors. + * @brief Sets the memory bound for all processors. + * @param MemoryBound The new memory bound for all processors. */ - inline void setMemoryBound(v_memw_t memory_bound_) { - memory_bound = std::vector>(number_processors, memory_bound_); + void setMemoryBound(const v_memw_t MemoryBound) { + memoryBound_.assign(numberOfProcessors_, MemoryBound); } - inline void setMemoryBound(const std::vector> &memory_bound_) { memory_bound = memory_bound_; } - - inline void setMemoryBound(v_memw_t memory_bound_, unsigned proc) { - - if (proc >= number_processors) { - throw std::invalid_argument("Invalid Argument setMemoryBound"); + /** + * @brief Sets the memory bound for all processors using a vector. + * @param MemoryBound The vector of memory bounds. + * @throws std::invalid_argument if the size of the vector is invalid. 
+ */ + void setMemoryBound(const std::vector> &MemoryBound) { + if (MemoryBound.size() != numberOfProcessors_) { + throw std::invalid_argument("Invalid Argument: Memory bound vector size does not match number of processors."); } + memoryBound_ = MemoryBound; + } - memory_bound[proc] = memory_bound_; + /** + * @brief Sets the memory bound for a specific processor. + * @param MemoryBound The new memory bound for the processor. + * @param processorIndex The processor index. Must be less than numberOfProcessors_; an index outside memoryBound_ makes at() throw std::out_of_range. + */ + void setMemoryBound(const v_memw_t MemoryBound, const unsigned processorIndex) { + memoryBound_.at(processorIndex) = MemoryBound; } /** - * @brief Sets the synchronization costs for the BspArchitecture. - * - * This function sets the synchronization costs for the BspArchitecture object. - * The synchronization costs represent the costs of establishing communication between processors. - * - * @param synch_cost The synchronization costs to be set. + * @brief Sets the synchronization costs. + * @param SynchCost The new synchronization costs. */ - inline void setSynchronisationCosts(v_commw_t synch_cost) { synchronisation_costs = synch_cost; } + void setSynchronisationCosts(const v_commw_t SynchCost) { synchronisationCosts_ = SynchCost; } /** - * @brief Sets the communication costs for the BspArchitecture. - * - * This function sets the communication costs for the BspArchitecture object. - * The communication costs represent the costs of sending messages between processors. - * - * @param comm_cost The communication costs to be set. + * @brief Sets the communication costs. + * @param CommCost The new communication costs. */ - inline void setCommunicationCosts(v_commw_t comm_cost) { communication_costs = comm_cost; } + void setCommunicationCosts(const v_commw_t CommCost) { communicationCosts_ = CommCost; } /** - * @brief Sets the number of processors in the BSP architecture.
- * - * This function sets the number of processors in the BSP architecture and sets the send costs between processors - * to 1. The send_costs matrix represents the costs of sending messages between processors. The diagonal elements of - * the matrix are set to 0, indicating that there is no cost to send a message from a processor to itself. - * - * @param num_proc The number of processors in the BSP architecture. + * @brief Checks if the architecture is NUMA. + * @return True if NUMA, false otherwise. */ - void setNumberOfProcessors(unsigned num_proc) { + [[nodiscard]] bool isNumaArchitecture() const { return isNuma_; } - number_processors = num_proc; - number_of_processor_types = 1; - processor_type = std::vector(number_processors, 0); - send_costs = std::vector>>( - number_processors, std::vector>(number_processors, 1)); - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; + /** + * @brief Sets the number of processors. Processor type is set to 0 for all processors. + * Resets send costs to uniform (1) and diagonal to 0. The memory bound is set to 100 for all processors. + * @param numberOfProcessors The number of processors. Must be greater than 0. + * @throws std::invalid_argument if the number of processors is 0. + */ + void setNumberOfProcessors(const unsigned numberOfProcessors) { + if (numberOfProcessors == 0) { + throw std::invalid_argument("Invalid Argument: Number of processors must be greater than 0."); } - memory_bound.resize(num_proc, memory_bound.back()); + numberOfProcessors_ = numberOfProcessors; + numberOfProcessorTypes_ = 1U; + processorTypes_.assign(numberOfProcessors_, 0U); + + InitializeUniformSendCosts(); - isNuma = false; + // initialize memory bound to 100 for all processors + memoryBound_.assign(numberOfProcessors_, 100U); } /** - * @brief Sets the number of processors and their types in the BSP architecture. 
- * - * This function sets the number of processors in the BSP architecture and sets the send costs between processors - * to 1. The send_costs matrix represents the costs of sending messages between processors. The diagonal elements of - * the matrix are set to 0, indicating that there is no cost to send a message from a processor to itself. - * - * @param processor_types_ The type of the respective processors. + * @brief Sets the number of processors and their types. Number of processors is set to the size of the processor types vector. + * Resets send costs to uniform (1). Resets memory bound to 100 for all processors. + * @param processorTypes The types of the respective processors. */ - void setProcessorsWithTypes(const std::vector> &processor_types_) { - - if (processor_types_.size() > std::numeric_limits::max()) { - throw std::invalid_argument("Invalid Argument, number of processors exceeds the limit"); + void setProcessorsWithTypes(const std::vector> &processorTypes) { + if (processorTypes.empty()) { + throw std::invalid_argument("Invalid Argument: Processor types vector is empty."); } - - number_processors = static_cast(processor_types_.size()); - - number_of_processor_types = 0; - processor_type = processor_types_; - send_costs = std::vector>>( - number_processors, std::vector>(number_processors, 1)); - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; + if (processorTypes.size() > std::numeric_limits::max()) { + throw std::invalid_argument("Invalid Argument: Number of processors exceeds the limit."); } - memory_bound.resize(number_processors, memory_bound.back()); + numberOfProcessors_ = static_cast(processorTypes.size()); + processorTypes_ = processorTypes; + + InitializeUniformSendCosts(); - isNuma = false; - updateNumberOfProcessorTypes(); + // initialize memory bound to 100 for all processors + memoryBound_.assign(numberOfProcessors_, 100U); + UpdateNumberOfProcessorTypes(); } /** - * Returns whether the architecture is NUMA. 
- * - * @return True if the architecture is NUMA, false otherwise. + * @brief Sets processors based on counts of consecutive types. + * The architecture will have processorTypeCount[0] processors of type 0, processorTypeCount[1] processors of type 1, etc. + * The memory bound for each processor of type i is set to processorTypeMemory[i]. + * The send costs are set to uniform (1). + * @param processorTypeCount Vector where index is type and value is count of processors of that type. + * @param processorTypeMemory Vector where index is type and value is memory bound for that type. */ - inline bool isNumaArchitecture() const { return isNuma; } - - void set_processors_consequ_types(const std::vector> &processor_type_count_, - const std::vector> &processor_type_memory_) { - - if (processor_type_count_.size() != processor_type_memory_.size()) { - throw std::invalid_argument( - "Invalid Argument, processor_type_count_ and processor_type_memory_ must have the same size"); + void SetProcessorsConsequTypes(const std::vector> &processorTypeCount, + const std::vector> &processorTypeMemory) { + if (processorTypeCount.size() != processorTypeMemory.size()) { + throw std::invalid_argument("Invalid Argument: processorTypeCount and processorTypeMemory must have the same size."); } - if (processor_type_count_.size() > std::numeric_limits::max()) { - throw std::invalid_argument("Invalid Argument, number of processors exceeds the limit"); + if (processorTypeCount.size() > std::numeric_limits::max()) { + throw std::invalid_argument("Invalid Argument: Number of processors exceeds the limit."); } - number_of_processor_types = static_cast(processor_type_count_.size()); - number_processors = std::accumulate(processor_type_count_.begin(), processor_type_count_.end(), 0u); + numberOfProcessorTypes_ = static_cast(processorTypeCount.size()); + numberOfProcessors_ = std::accumulate(processorTypeCount.begin(), processorTypeCount.end(), 0U); - processor_type = std::vector>(number_processors, 0); - 
memory_bound = std::vector>(number_processors, 0); + // initialize processor types and memory bound + processorTypes_.assign(numberOfProcessors_, 0U); + memoryBound_.assign(numberOfProcessors_, 0U); - unsigned offset = 0; - for (unsigned i = 0; i < processor_type_count_.size(); i++) { - - for (unsigned j = 0; j < processor_type_count_[i]; j++) { - processor_type[offset + j] = i; - memory_bound[offset + j] = processor_type_memory_[i]; + unsigned offset = 0U; + for (unsigned i = 0U; i < processorTypeCount.size(); i++) { + for (unsigned j = 0U; j < processorTypeCount.at(i); j++) { + processorTypes_.at(offset + j) = i; + memoryBound_.at(offset + j) = processorTypeMemory.at(i); } - offset += processor_type_count_[i]; + offset += processorTypeCount.at(i); } - send_costs = std::vector>>( - number_processors, std::vector>(number_processors, 1)); - for (unsigned i = 0; i < number_processors; i++) { - send_costs[i][i] = 0; - } - isNuma = false; + InitializeUniformSendCosts(); } /** - * Returns the memory bound of the BspArchitecture. - * - * @return The memory bound as an unsigned integer. + * @brief Returns the memory bounds of all processors. + * @return Vector of memory bounds. */ - inline const std::vector> &memoryBound() const { return memory_bound; } + [[nodiscard]] const std::vector> &memoryBound() const { return memoryBound_; } - inline v_memw_t memoryBound(unsigned proc) const { return memory_bound[proc]; } + /** + * @brief Returns the memory bound of a specific processor. + * @param proc The processor index. + * @return The memory bound. 
+ */ + [[nodiscard]] v_memw_t memoryBound(const unsigned proc) const { return memoryBound_[proc]; } - v_memw_t minMemoryBound() const { return *(std::min_element(memory_bound.begin(), memory_bound.end())); } - v_memw_t maxMemoryBound() const { return *(std::max_element(memory_bound.begin(), memory_bound.end())); } - v_memw_t sumMemoryBound() const { return std::accumulate(memory_bound.begin(), memory_bound.end(), 0); } + /** + * @brief Returns the maximum memory bound over all processors. + * @return The maximum memory bound. + */ + [[nodiscard]] v_memw_t maxMemoryBound() const { return *(std::max_element(memoryBound_.begin(), memoryBound_.end())); } - v_memw_t maxMemoryBoundProcType(v_type_t procType) const { - v_memw_t max_mem = 0; - for (unsigned proc = 0; proc < number_processors; proc++) { - if (processor_type[proc] == procType) { - max_mem = std::max(max_mem, memory_bound[proc]); + /** + * @brief Returns the maximum memory bound over all processors of a specific type. + * + * @param procType The processor type. + * @return The maximum memory bound. + */ + [[nodiscard]] v_memw_t maxMemoryBoundProcType(const v_type_t procType) const { + v_memw_t max_mem = 0U; + for (unsigned proc = 0U; proc < numberOfProcessors_; proc++) { + if (processorTypes_[proc] == procType) { + max_mem = std::max(max_mem, memoryBound_[proc]); } } return max_mem; } /** - * Returns the number of processors in the architecture. - * + * @brief Returns the number of processors. * @return The number of processors. */ - inline unsigned numberOfProcessors() const { return number_processors; } + [[nodiscard]] unsigned numberOfProcessors() const { return numberOfProcessors_; } /** - * Returns the communication costs of the BSP architecture. - * - * @return The communication costs as an unsigned integer. + * @brief Returns the communication costs. + * @return The communication costs. 
*/ - inline v_commw_t communicationCosts() const { return communication_costs; } + [[nodiscard]] v_commw_t communicationCosts() const { return communicationCosts_; } /** - * Returns the synchronization costs of the BspArchitecture. - * - * @return The synchronization costs as an unsigned integer. + * @brief Returns the synchronization costs. + * @return The synchronization costs. */ - inline v_commw_t synchronisationCosts() const { return synchronisation_costs; } + [[nodiscard]] v_commw_t synchronisationCosts() const { return synchronisationCosts_; } /** - * Returns a copy of the send costs matrix. - * - * @return A copy of the send costs matrix. + * @brief Returns a the send costs matrix. Internally the matrix is stored as a flattened matrix. The allocates, computes and returns the matrix on the fly. + * @return The send costs matrix. */ - inline std::vector>> sendCostMatrixCopy() const { return send_costs; } + [[nodiscard]] std::vector>> sendCost() const { + std::vector>> matrix(numberOfProcessors_, std::vector>(numberOfProcessors_)); + for (unsigned i = 0; i < numberOfProcessors_; ++i) { + for (unsigned j = 0; j < numberOfProcessors_; ++j) { + matrix[i][j] = sendCosts_[FlatIndex(i, j)]; + } + } + return matrix; + } /** - * Returns a reference to the send costs matrix. - * - * @return A reference to the send costs matrix. + * @brief Returns the flattened send costs vector. + * @return The send costs vector. */ - inline const std::vector>> &sendCostMatrix() const { return send_costs; } + [[nodiscard]] const std::vector> &sendCostsVector() const { return sendCosts_; } - // the type indeces of the processor (e.g. CPU, vector/tensor core) - inline const std::vector &processorTypes() const { return processor_type; } + /** + * @brief Returns the processor types. + * @return Vector of processor types. + */ + [[nodiscard]] const std::vector &processorTypes() const { return processorTypes_; } /** - * Returns the communication costs between two processors. 
The communication costs are the send costs multiplied by - * the communication costs. + * @brief Returns the communication costs between two processors. Does not perform bounds checking. + * The communication costs are the send costs multiplied by the communication costs factor. * * @param p1 The index of the first processor. * @param p2 The index of the second processor. - * - * @return The send costs between the two processors. + * @return The communication costs between the two processors. */ - inline v_commw_t communicationCosts(unsigned p1, unsigned p2) const { - return communication_costs * send_costs[p1][p2]; + [[nodiscard]] v_commw_t communicationCosts(const unsigned p1, const unsigned p2) const { + return communicationCosts_ * sendCosts_[FlatIndex(p1, p2)]; } /** - * Returns the send costs between two processors. + * @brief Returns the send costs between two processors. Does not perform bounds checking. * * @param p1 The index of the first processor. * @param p2 The index of the second processor. - * * @return The send costs between the two processors. */ - inline v_commw_t sendCosts(unsigned p1, unsigned p2) const { return send_costs[p1][p2]; } - - inline auto sendCosts() const { return send_costs; } - - // the type index of the processor (e.g. CPU, vector/tensor core) - inline v_type_t processorType(unsigned p1) const { return processor_type[p1]; } + [[nodiscard]] v_commw_t sendCosts(const unsigned p1, const unsigned p2) const { return sendCosts_[FlatIndex(p1, p2)]; } - void setProcessorType(unsigned p1, v_type_t type) { - - if (p1 >= number_processors) - throw std::invalid_argument("Invalid Argument"); + /** + * @brief Returns the type of a specific processor. Does not perform bounds checking. + * @param p1 The processor index. + * @return The processor type. 
+ */ + [[nodiscard]] v_type_t processorType(const unsigned p1) const { return processorTypes_[p1]; } - processor_type[p1] = type; - number_of_processor_types = std::max(number_of_processor_types, type + 1u); + /** + * @brief Sets the type of a specific processor. Performs bounds checking. + * @param p1 The processor index. + * @param type The new processor type. + */ + void setProcessorType(const unsigned p1, const v_type_t type) { + processorTypes_.at(p1) = type; + numberOfProcessorTypes_ = std::max(numberOfProcessorTypes_, type + 1U); } - std::vector getProcessorTypeCount() const { - - std::vector type_count(number_of_processor_types, 0u); - for (unsigned p = 0u; p < number_processors; p++) { - type_count[processor_type[p]]++; + /** + * @brief Returns the count of processors for each type. + * @return Vector where index is type and value is count. + */ + [[nodiscard]] std::vector getProcessorTypeCount() const { + std::vector type_count(numberOfProcessorTypes_, 0U); + for (unsigned p = 0U; p < numberOfProcessors_; p++) { + type_count[processorTypes_[p]]++; } return type_count; } - unsigned getMinProcessorTypeCount() const { - const auto &type_count = getProcessorTypeCount(); - if (type_count.empty()) { - return 0; - } - return *std::min_element(type_count.begin(), type_count.end()); - } - - void print_architecture(std::ostream &os) const { - - os << "Architectur info: number of processors: " << number_processors - << ", Number of processor types: " << number_of_processor_types - << ", Communication costs: " << communication_costs << ", Synchronization costs: " << synchronisation_costs - << std::endl; + /** + * @brief Prints the architecture details to the output stream. + * @param os The output stream. 
+ */ + void print(std::ostream &os) const { + os << "Architecture info: number of processors: " << numberOfProcessors_ + << ", Number of processor types: " << numberOfProcessorTypes_ + << ", Communication costs: " << communicationCosts_ << ", Synchronization costs: " << synchronisationCosts_ + << "\n"; os << std::setw(17) << " Processor: "; - for (unsigned i = 0; i < number_processors; i++) { + for (unsigned i = 0U; i < numberOfProcessors_; i++) { os << std::right << std::setw(5) << i << " "; } - os << std::endl; + os << "\n"; os << std::setw(17) << "Processor type: "; - for (unsigned i = 0; i < number_processors; i++) { - os << std::right << std::setw(5) << processor_type[i] << " "; + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + os << std::right << std::setw(5) << processorTypes_.at(i) << " "; } - os << std::endl; + os << "\n"; os << std::setw(17) << "Memory bound: "; - for (unsigned i = 0; i < number_processors; i++) { - os << std::right << std::setw(5) << memory_bound[i] << " "; - } - os << std::endl; - } - - void updateNumberOfProcessorTypes() { - number_of_processor_types = 0; - for (unsigned p = 0; p < number_processors; p++) { - if (processor_type[p] >= number_of_processor_types) { - number_of_processor_types = processor_type[p] + 1; - } - } - } - - std::vector> getProcessorIdsByType() const { - std::vector> processor_ids_by_type(number_of_processor_types); - for (unsigned i = 0; i < numberOfProcessors(); ++i) { - processor_ids_by_type[processorType(i)].push_back(i); + for (unsigned i = 0U; i < numberOfProcessors_; i++) { + os << std::right << std::setw(5) << memoryBound_.at(i) << " "; } - return processor_ids_by_type; + os << "\n"; } - inline unsigned getNumberOfProcessorTypes() const { return number_of_processor_types; }; + [[nodiscard]] unsigned getNumberOfProcessorTypes() const { return numberOfProcessorTypes_; }; - inline MEMORY_CONSTRAINT_TYPE getMemoryConstraintType() const { return memory_const_type; } - inline void 
setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE memory_const_type_) { - memory_const_type = memory_const_type_; + [[nodiscard]] MEMORY_CONSTRAINT_TYPE getMemoryConstraintType() const { return memoryConstraintType_; } + void setMemoryConstraintType(const MEMORY_CONSTRAINT_TYPE memoryConstraintType) { + memoryConstraintType_ = memoryConstraintType; } }; diff --git a/include/osp/bsp/model/BspInstance.hpp b/include/osp/bsp/model/BspInstance.hpp index 4e31d145..7ab72fd4 100644 --- a/include/osp/bsp/model/BspInstance.hpp +++ b/include/osp/bsp/model/BspInstance.hpp @@ -27,7 +27,10 @@ limitations under the License. namespace osp { -enum class RETURN_STATUS { OSP_SUCCESS, BEST_FOUND, TIMEOUT, ERROR }; +enum class RETURN_STATUS { OSP_SUCCESS, + BEST_FOUND, + TIMEOUT, + ERROR }; inline std::string to_string(const RETURN_STATUS status) { switch (status) { @@ -44,13 +47,23 @@ inline std::string to_string(const RETURN_STATUS status) { } } -inline std::ostream& operator<<(std::ostream& os, RETURN_STATUS status) { +inline std::ostream &operator<<(std::ostream &os, RETURN_STATUS status) { switch (status) { - case RETURN_STATUS::OSP_SUCCESS: os << "SUCCESS"; break; - case RETURN_STATUS::BEST_FOUND: os << "BEST_FOUND"; break; - case RETURN_STATUS::TIMEOUT: os << "TIMEOUT"; break; - case RETURN_STATUS::ERROR: os << "ERROR"; break; - default: os << "UNKNOWN"; break; + case RETURN_STATUS::OSP_SUCCESS: + os << "SUCCESS"; + break; + case RETURN_STATUS::BEST_FOUND: + os << "BEST_FOUND"; + break; + case RETURN_STATUS::TIMEOUT: + os << "TIMEOUT"; + break; + case RETURN_STATUS::ERROR: + os << "ERROR"; + break; + default: + os << "UNKNOWN"; + break; } return os; } @@ -192,13 +205,21 @@ class BspInstance { /** * @brief Returns a copy of the send costs matrix. - * * @return A copy of the send costs matrix. 
*/ - inline const std::vector>> &sendCostMatrix() const { + inline std::vector>> sendCostMatrix() const { return architecture.sendCostMatrix(); } + /** + * @brief Returns the flattened send costs vector. + * + * @return The flattened send costs vector. + */ + inline const std::vector> &sendCostsVector() const { + return architecture.sendCostsVector(); + } + /** * @brief Returns the communication costs of the BSP architecture. * @@ -389,53 +410,48 @@ class BspInstance { }; template -class compatible_processor_range { +class CompatibleProcessorRange { std::vector> type_processor_idx; const BspInstance *instance = nullptr; - public: + public: + CompatibleProcessorRange() = default; - compatible_processor_range() = default; - - compatible_processor_range(const BspInstance &inst) { + CompatibleProcessorRange(const BspInstance &inst) { initialize(inst); } - + inline void initialize(const BspInstance &inst) { instance = &inst; - if constexpr (has_typed_vertices_v) { - + if constexpr (has_typed_vertices_v) { + type_processor_idx = std::vector>(inst.getComputationalDag().num_vertex_types()); for (v_type_t v_type = 0; v_type < inst.getComputationalDag().num_vertex_types(); v_type++) { - for (unsigned proc = 0; proc < inst.numberOfProcessors(); proc++) - if (inst.isCompatibleType(v_type, inst.processorType(proc))) - type_processor_idx[v_type].push_back(proc); - + for (unsigned proc = 0; proc < inst.numberOfProcessors(); proc++) + if (inst.isCompatibleType(v_type, inst.processorType(proc))) + type_processor_idx[v_type].push_back(proc); } - } + } } - inline const auto & compatible_processors_type(v_type_t type) const { + inline const auto &compatible_processors_type(v_type_t type) const { assert(instance != nullptr); if constexpr (has_typed_vertices_v) { - return type_processor_idx[type]; + return type_processor_idx[type]; } else { return instance->processors(); } } - inline const auto & compatible_processors_vertex(vertex_idx_t vertex) const { + inline const auto 
&compatible_processors_vertex(vertex_idx_t vertex) const { return compatible_processors_type(instance->getComputationalDag().vertex_type(vertex)); } - - }; - } // namespace osp \ No newline at end of file diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp index 38fae9ff..b5b4ea95 100644 --- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp +++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp @@ -39,6 +39,8 @@ limitations under the License. namespace osp { +static constexpr unsigned CacheLineSize = 64; + template struct GrowLocalAutoCoresParallel_Params { vert_t minSuperstepSize = 20; diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp index f6c425bd..2cf0c631 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp @@ -97,7 +97,7 @@ struct kl_bsp_comm_cost_function { constexpr static bool is_max_comm_cost_function = true; kl_active_schedule *active_schedule; - compatible_processor_range *proc_range; + CompatibleProcessorRange *proc_range; const Graph_t *graph; const BspInstance *instance; @@ -119,7 +119,7 @@ struct kl_bsp_comm_cost_function { } void initialize(kl_active_schedule &sched, - compatible_processor_range &p_range) { + CompatibleProcessorRange &p_range) { active_schedule = &sched; proc_range = &p_range; instance = &sched.getInstance(); diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp index 50384c72..caaad9ca 100644 --- 
a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp @@ -24,24 +24,24 @@ limitations under the License. namespace osp { -template +template struct kl_hyper_total_comm_cost_function { - + using VertexType = vertex_idx_t; using kl_move = kl_move_struct; using kl_gain_update_info = kl_update_info; - + constexpr static unsigned window_range = 2 * window_size + 1; constexpr static bool is_max_comm_cost_function = false; kl_active_schedule *active_schedule; - compatible_processor_range *proc_range; + CompatibleProcessorRange *proc_range; const Graph_t *graph; const BspInstance *instance; - cost_t comm_multiplier = 1; + cost_t comm_multiplier = 1; cost_t max_comm_weight = 0; lambda_vector_container node_lambda_map; @@ -52,20 +52,20 @@ struct kl_hyper_total_comm_cost_function { const std::string name() const { return "toal_comm_cost"; } inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); } - void initialize(kl_active_schedule &sched, compatible_processor_range &p_range) { + void initialize(kl_active_schedule &sched, CompatibleProcessorRange &p_range) { active_schedule = &sched; proc_range = &p_range; instance = &sched.getInstance(); graph = &instance->getComputationalDag(); - comm_multiplier = 1.0 / instance->numberOfProcessors(); - node_lambda_map.initialize(graph->num_vertices(), instance->numberOfProcessors()); + comm_multiplier = 1.0 / instance->numberOfProcessors(); + node_lambda_map.initialize(graph->num_vertices(), instance->numberOfProcessors()); } struct empty_struct {}; using pre_move_comm_data_t = empty_struct; - inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); } + inline empty_struct get_pre_move_comm_data(const kl_move &) { return empty_struct(); } cost_t compute_schedule_cost() { cost_t work_costs = 
0; @@ -74,7 +74,7 @@ struct kl_hyper_total_comm_cost_function { } cost_t comm_costs = 0; - for(const auto vertex : graph->vertices()) { + for (const auto vertex : graph->vertices()) { const unsigned vertex_proc = active_schedule->assigned_processor(vertex); const cost_t v_comm_cost = graph->vertex_comm_weight(vertex); max_comm_weight = std::max(max_comm_weight, v_comm_cost); @@ -87,7 +87,7 @@ struct kl_hyper_total_comm_cost_function { if (node_lambda_map.increase_proc_count(vertex, target_proc)) { comm_costs += v_comm_cost * instance->communicationCosts(vertex_proc, target_proc); // is 0 if target_proc == vertex_proc } - } + } } return work_costs + comm_costs * comm_multiplier + static_cast>(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); @@ -100,132 +100,132 @@ struct kl_hyper_total_comm_cost_function { } cost_t comm_costs = 0; - for(const auto vertex : graph->vertices()) { + for (const auto vertex : graph->vertices()) { const unsigned vertex_proc = active_schedule->assigned_processor(vertex); const cost_t v_comm_cost = graph->vertex_comm_weight(vertex); for (const auto lambdaproc_mult_pair : node_lambda_map.iterate_proc_entries(vertex)) { const auto &lambda_proc = lambdaproc_mult_pair.first; comm_costs += v_comm_cost * instance->communicationCosts(vertex_proc, lambda_proc); - } + } } return work_costs + comm_costs * comm_multiplier + static_cast>(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); } - inline void update_datastructure_after_move(const kl_move & move, const unsigned start_step, const unsigned end_step) { - if (move.to_proc != move.from_proc) { + inline void update_datastructure_after_move(const kl_move &move, const unsigned start_step, const unsigned end_step) { + if (move.to_proc != move.from_proc) { for (const auto &source : instance->getComputationalDag().parents(move.node)) { const unsigned source_step = active_schedule->assigned_superstep(source); if (source_step < start_step || source_step > end_step) 
continue; - update_source_after_move(move, source); + update_source_after_move(move, source); } } } - inline void update_source_after_move(const kl_move & move, VertexType source) { + inline void update_source_after_move(const kl_move &move, VertexType source) { node_lambda_map.decrease_proc_count(source, move.from_proc); node_lambda_map.increase_proc_count(source, move.to_proc); } template - void update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map & max_gain_recompute, std::vector &new_nodes) { - + void update_node_comm_affinity(const kl_move &move, thread_data_t &thread_data, const cost_t &penalty, const cost_t &reward, std::map &max_gain_recompute, std::vector &new_nodes) { + const unsigned start_step = thread_data.start_step; const unsigned end_step = thread_data.end_step; - + for (const auto &target : instance->getComputationalDag().children(move.node)) { - const unsigned target_step = active_schedule->assigned_superstep(target); + const unsigned target_step = active_schedule->assigned_superstep(target); if (target_step < start_step || target_step > end_step) continue; - if(thread_data.lock_manager.is_locked(target)) + if (thread_data.lock_manager.is_locked(target)) continue; if (not thread_data.affinity_table.is_selected(target)) { - new_nodes.push_back(target); + new_nodes.push_back(target); continue; } if (max_gain_recompute.find(target) != max_gain_recompute.end()) { - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } + } const unsigned target_proc = active_schedule->assigned_processor(target); - const unsigned target_start_idx = start_idx(target_step, start_step); + const unsigned target_start_idx = start_idx(target_step, start_step); auto &affinity_table = thread_data.affinity_table.at(target); if (move.from_step < target_step + (move.from_proc == 
target_proc)) { - const unsigned diff = target_step - move.from_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; - unsigned idx = target_start_idx; + const unsigned diff = target_step - move.from_step; + const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0; + unsigned idx = target_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table[p][idx] -= penalty; - } - } + } + } if (idx - 1 < bound && is_compatible(target, move.from_proc)) { - affinity_table[move.from_proc][idx - 1] += penalty; + affinity_table[move.from_proc][idx - 1] += penalty; } } else { const unsigned diff = move.from_step - target_step; - const unsigned window_bound = end_idx(target_step, end_step); - unsigned idx = std::min(window_size + diff, window_bound); - - if (idx < window_bound && is_compatible(target, move.from_proc)) { - affinity_table[move.from_proc][idx] += reward; + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + + if (idx < window_bound && is_compatible(target, move.from_proc)) { + affinity_table[move.from_proc][idx] += reward; } idx++; - + for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table[p][idx] += reward; - } - } + } + } } if (move.to_step < target_step + (move.to_proc == target_proc)) { - unsigned idx = target_start_idx; - const unsigned diff = target_step - move.to_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; + unsigned idx = target_start_idx; + const unsigned diff = target_step - move.to_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table[p][idx] += penalty; - } - } + } + } if (idx - 1 < bound && is_compatible(target, move.to_proc)) { - affinity_table[move.to_proc][idx - 1] -= penalty; + affinity_table[move.to_proc][idx - 1] -= penalty; } } else { const unsigned diff = move.to_step - target_step; - const unsigned window_bound = end_idx(target_step, end_step); - unsigned idx = std::min(window_size + diff, window_bound); - + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + if (idx < window_bound && is_compatible(target, move.to_proc)) { - affinity_table[move.to_proc][idx] -= reward; + affinity_table[move.to_proc][idx] -= reward; } idx++; - + for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table[p][idx] -= reward; - } - } + } + } } - if (move.to_proc != move.from_proc) { + if (move.to_proc != move.from_proc) { const cost_t comm_gain = graph->vertex_comm_weight(move.node) * comm_multiplier; - + const unsigned window_bound = end_idx(target_step, end_step); - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { if (p == target_proc) continue; if (node_lambda_map.get_proc_entry(move.node, target_proc) == 1) { @@ -233,144 +233,143 @@ struct kl_hyper_total_comm_cost_function { const cost_t x = instance->communicationCosts(move.from_proc, target_proc) * comm_gain; const cost_t y = instance->communicationCosts(move.to_proc, target_proc) * comm_gain; affinity_table[p][idx] += x - y; - } + } } if (node_lambda_map.has_no_proc_entry(move.node, p)) { for (unsigned idx 
= target_start_idx; idx < window_bound; idx++) { const cost_t x = instance->communicationCosts(move.from_proc, p) * comm_gain; const cost_t y = instance->communicationCosts(move.to_proc, p) * comm_gain; - affinity_table[p][idx] -= x - y; + affinity_table[p][idx] -= x - y; } - } + } } - } + } } - for (const auto &source : instance->getComputationalDag().parents(move.node)) { + for (const auto &source : instance->getComputationalDag().parents(move.node)) { if (move.to_proc != move.from_proc) { - const unsigned source_proc = active_schedule->assigned_processor(source); - if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) { + const unsigned source_proc = active_schedule->assigned_processor(source); + if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; for (const auto &target : instance->getComputationalDag().children(source)) { const unsigned target_step = active_schedule->assigned_superstep(target); - if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) - continue; + if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) + continue; - if (source_proc != move.from_proc && is_compatible(target, move.from_proc)) { + if (source_proc != move.from_proc && is_compatible(target, move.from_proc)) { if (max_gain_recompute.find(target) != max_gain_recompute.end()) { // todo more specialized update - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } + } - auto & affinity_table_target_from_proc = thread_data.affinity_table.at(target)[move.from_proc]; + auto &affinity_table_target_from_proc = 
thread_data.affinity_table.at(target)[move.from_proc]; const unsigned target_window_bound = end_idx(target_step, end_step); const cost_t comm_aff = instance->communicationCosts(source_proc, move.from_proc) * comm_gain; for (unsigned idx = start_idx(target_step, start_step); idx < target_window_bound; idx++) { affinity_table_target_from_proc[idx] += comm_aff; } } - } - } else if (node_lambda_map.get_proc_entry(source, move.from_proc) == 1) { + } + } else if (node_lambda_map.get_proc_entry(source, move.from_proc) == 1) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; for (const auto &target : instance->getComputationalDag().children(source)) { const unsigned target_step = active_schedule->assigned_superstep(target); - if ((target_step < start_step || target_step > end_step) || (target == move.node) || thread_data.lock_manager.is_locked(target) || (not thread_data.affinity_table.is_selected(target))) - continue; + if ((target_step < start_step || target_step > end_step) || (target == move.node) || thread_data.lock_manager.is_locked(target) || (not thread_data.affinity_table.is_selected(target))) + continue; const unsigned target_proc = active_schedule->assigned_processor(target); - if (target_proc == move.from_proc) { + if (target_proc == move.from_proc) { if (max_gain_recompute.find(target) != max_gain_recompute.end()) { // todo more specialized update - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } - + } + const unsigned target_start_idx = start_idx(target_step, start_step); const unsigned target_window_bound = end_idx(target_step, end_step); - auto & affinity_table_target = thread_data.affinity_table.at(target); + auto &affinity_table_target = thread_data.affinity_table.at(target); const cost_t comm_aff = instance->communicationCosts(source_proc, target_proc) * comm_gain; for (const unsigned p : 
proc_range->compatible_processors_vertex(target)) { if (p == target_proc) - continue; - + continue; + for (unsigned idx = target_start_idx; idx < target_window_bound; idx++) { affinity_table_target[p][idx] -= comm_aff; - } + } } break; // since node_lambda_map[source][move.from_proc] == 1 - } - } + } + } } if (node_lambda_map.get_proc_entry(source, move.to_proc) == 1) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - + for (const auto &target : instance->getComputationalDag().children(source)) { const unsigned target_step = active_schedule->assigned_superstep(target); - if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) - continue; - + if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) + continue; + if (source_proc != move.to_proc && is_compatible(target, move.to_proc)) { if (max_gain_recompute.find(target) != max_gain_recompute.end()) { - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } - + } + const unsigned target_window_bound = end_idx(target_step, end_step); - auto & affinity_table_target_to_proc = thread_data.affinity_table.at(target)[move.to_proc]; + auto &affinity_table_target_to_proc = thread_data.affinity_table.at(target)[move.to_proc]; const cost_t comm_aff = instance->communicationCosts(source_proc, move.to_proc) * comm_gain; for (unsigned idx = start_idx(target_step, start_step); idx < target_window_bound; idx++) { affinity_table_target_to_proc[idx] -= comm_aff; - } + } } } - } else if (node_lambda_map.get_proc_entry(source, move.to_proc) == 2) { + } else if (node_lambda_map.get_proc_entry(source, move.to_proc) == 2) { for (const auto 
&target : instance->getComputationalDag().children(source)) { const unsigned target_step = active_schedule->assigned_superstep(target); - if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) - continue; - + if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target)) + continue; + const unsigned target_proc = active_schedule->assigned_processor(target); if (target_proc == move.to_proc) { if (source_proc != target_proc) { if (max_gain_recompute.find(target) != max_gain_recompute.end()) { - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } - + } + const unsigned target_start_idx = start_idx(target_step, start_step); const unsigned target_window_bound = end_idx(target_step, end_step); - auto & affinity_table_target = thread_data.affinity_table.at(target); + auto &affinity_table_target = thread_data.affinity_table.at(target); const cost_t comm_aff = instance->communicationCosts(source_proc, target_proc) * graph->vertex_comm_weight(source) * comm_multiplier; for (const unsigned p : proc_range->compatible_processors_vertex(target)) { if (p == target_proc) - continue; - + continue; + for (unsigned idx = target_start_idx; idx < target_window_bound; idx++) { affinity_table_target[p][idx] += comm_aff; - } + } } } break; - } - } - } + } + } + } } - - const unsigned source_step = active_schedule->assigned_superstep(source); + const unsigned source_step = active_schedule->assigned_superstep(source); if (source_step < start_step || source_step > end_step) continue; - if(thread_data.lock_manager.is_locked(source)) - continue; + if (thread_data.lock_manager.is_locked(source)) + continue; if (not 
thread_data.affinity_table.is_selected(source)) { new_nodes.push_back(source); @@ -378,111 +377,111 @@ struct kl_hyper_total_comm_cost_function { } if (max_gain_recompute.find(source) != max_gain_recompute.end()) { - max_gain_recompute[source].full_update = true; + max_gain_recompute[source].full_update = true; } else { max_gain_recompute[source] = kl_gain_update_info(source, true); - } + } - const unsigned source_proc = active_schedule->assigned_processor(source); + const unsigned source_proc = active_schedule->assigned_processor(source); const unsigned source_start_idx = start_idx(source_step, start_step); const unsigned window_bound = end_idx(source_step, end_step); - auto & affinity_table_source = thread_data.affinity_table.at(source); + auto &affinity_table_source = thread_data.affinity_table.at(source); if (move.from_step < source_step + (move.from_proc != source_proc)) { - const unsigned diff = source_step - move.from_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = source_step - move.from_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = source_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] += reward; - } + } } if (window_size >= diff && is_compatible(source, move.from_proc)) { - affinity_table_source[move.from_proc][idx] += reward; + affinity_table_source[move.from_proc][idx] += reward; } - } else { + } else { const unsigned diff = move.from_step - source_step; - unsigned idx = window_size + diff; - + unsigned idx = window_size + diff; + if (idx < window_bound && is_compatible(source, move.from_proc)) { - affinity_table_source[move.from_proc][idx] += penalty; + affinity_table_source[move.from_proc][idx] += penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] -= penalty; - } - } + } + } } if (move.to_step < source_step + (move.to_proc != source_proc)) { - const unsigned diff = source_step - move.to_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = source_step - move.to_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = source_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] -= reward; - } + } } if (window_size >= diff && is_compatible(source, move.to_proc)) { - affinity_table_source[move.to_proc][idx] -= reward; + affinity_table_source[move.to_proc][idx] -= reward; } - } else { + } else { const unsigned diff = move.to_step - source_step; - unsigned idx = window_size + diff; + unsigned idx = window_size + diff; if (idx < window_bound && is_compatible(source, move.to_proc)) { - affinity_table_source[move.to_proc][idx] -= penalty; + affinity_table_source[move.to_proc][idx] -= penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] += penalty; - } - } - } - - if (move.to_proc != move.from_proc) { - if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) { + } + } + } + + if (move.to_proc != move.from_proc) { + if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { if (p == source_proc) continue; const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, move.from_proc), instance->communicationCosts(source_proc, move.from_proc), comm_gain); for (unsigned idx = source_start_idx; idx < window_bound; idx++) { affinity_table_source[p][idx] -= comm_cost; - } - } - } + } + } + } if (node_lambda_map.get_proc_entry(source, move.to_proc) == 1) { const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - for (const unsigned p : 
proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { if (p == source_proc) continue; const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, move.to_proc), instance->communicationCosts(source_proc, move.to_proc), comm_gain); for (unsigned idx = source_start_idx; idx < window_bound; idx++) { affinity_table_source[p][idx] += comm_cost; - } + } } - } - } - } + } + } + } } inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return node_step < window_size + start_step ? window_size - (node_step - start_step) : 0; } - inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return node_step + window_size <= end_step ? window_range : window_range - (node_step + window_size - end_step); } - inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;} + inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return node_step + window_size <= end_step ? window_range : window_range - (node_step + window_size - end_step); } + inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? 
(p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0; } template - void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) { + void compute_comm_affinity(VertexType node, affinity_table_t &affinity_table_node, const cost_t &penalty, const cost_t &reward, const unsigned start_step, const unsigned end_step) { const unsigned node_step = active_schedule->assigned_superstep(node); const unsigned node_proc = active_schedule->assigned_processor(node); const unsigned window_bound = end_idx(node_step, end_step); @@ -490,42 +489,42 @@ struct kl_hyper_total_comm_cost_function { for (const auto &target : instance->getComputationalDag().children(node)) { const unsigned target_step = active_schedule->assigned_superstep(target); - const unsigned target_proc = active_schedule->assigned_processor(target); + const unsigned target_proc = active_schedule->assigned_processor(target); if (target_step < node_step + (target_proc != node_proc)) { - const unsigned diff = node_step - target_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = node_step - target_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = node_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] -= reward; - } + } } if (window_size >= diff && is_compatible(node, target_proc)) { - affinity_table_node[target_proc][idx] -= reward; - } + affinity_table_node[target_proc][idx] -= reward; + } - } else { + } else { const unsigned diff = target_step - node_step; unsigned idx = window_size + diff; if (idx < window_bound && is_compatible(node, target_proc)) { - affinity_table_node[target_proc][idx] -= penalty; + affinity_table_node[target_proc][idx] -= penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] += penalty; - } - } - } + } + } + } } // traget const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier; - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { if (p == node_proc) continue; @@ -540,21 +539,21 @@ struct kl_hyper_total_comm_cost_function { for (const auto &source : instance->getComputationalDag().parents(node)) { const unsigned source_step = active_schedule->assigned_superstep(source); - const unsigned source_proc = active_schedule->assigned_processor(source); + const unsigned source_proc = active_schedule->assigned_processor(source); if (source_step < node_step + (source_proc == node_proc)) { - const unsigned diff = node_step - source_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; + const unsigned diff = node_step - source_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; unsigned idx = node_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { - affinity_table_node[p][idx] += penalty; - } + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] += penalty; + } } if (idx - 1 < bound && is_compatible(node, source_proc)) { - affinity_table_node[source_proc][idx - 1] -= penalty; + affinity_table_node[source_proc][idx - 1] -= penalty; } } else { @@ -562,34 +561,34 @@ struct kl_hyper_total_comm_cost_function { unsigned idx = std::min(window_size + diff, window_bound); if (idx < window_bound && is_compatible(node, source_proc)) { - affinity_table_node[source_proc][idx] -= reward; - } + affinity_table_node[source_proc][idx] -= reward; + } idx++; for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] -= reward; - } - } + } + } } const cost_t source_comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { if (p == node_proc) continue; if (source_proc != node_proc && node_lambda_map.get_proc_entry(source, node_proc) == 1) { for (unsigned idx = node_start_idx; idx < window_bound; idx++) { affinity_table_node[p][idx] -= instance->communicationCosts(source_proc, node_proc) * source_comm_gain; - } + } } if (source_proc != p && node_lambda_map.has_no_proc_entry(source, p)) { for (unsigned idx = node_start_idx; idx < window_bound; idx++) { affinity_table_node[p][idx] += instance->communicationCosts(source_proc, p) * source_comm_gain; } - } + } } } // source } diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp 
b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp index be7c627c..5f471077 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp @@ -22,26 +22,26 @@ limitations under the License. #include "../kl_improver.hpp" namespace osp { -template +template struct kl_total_comm_cost_function { - + using VertexType = vertex_idx_t; using kl_move = kl_move_struct; using kl_gain_update_info = kl_update_info; - + constexpr static bool is_max_comm_cost_function = false; constexpr static unsigned window_range = 2 * window_size + 1; constexpr static bool use_node_communication_costs = use_node_communication_costs_arg || not has_edge_weights_v; - + kl_active_schedule *active_schedule; - compatible_processor_range *proc_range; + CompatibleProcessorRange *proc_range; const Graph_t *graph; const BspInstance *instance; - cost_t comm_multiplier = 1; + cost_t comm_multiplier = 1; cost_t max_comm_weight = 0; inline cost_t get_comm_multiplier() { return comm_multiplier; } @@ -52,23 +52,23 @@ struct kl_total_comm_cost_function { inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); } - void initialize(kl_active_schedule &sched, compatible_processor_range &p_range) { + void initialize(kl_active_schedule &sched, CompatibleProcessorRange &p_range) { active_schedule = &sched; proc_range = &p_range; instance = &sched.getInstance(); graph = &instance->getComputationalDag(); - comm_multiplier = 1.0 / instance->numberOfProcessors(); + comm_multiplier = 1.0 / instance->numberOfProcessors(); } struct empty_struct {}; using pre_move_comm_data_t = empty_struct; - inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); } + inline empty_struct get_pre_move_comm_data(const kl_move &) { return empty_struct(); } 
cost_t compute_schedule_cost_test() { return compute_schedule_cost(); } - void update_datastructure_after_move(const kl_move&, const unsigned, const unsigned) {} + void update_datastructure_after_move(const kl_move &, const unsigned, const unsigned) {} cost_t compute_schedule_cost() { @@ -89,7 +89,7 @@ struct kl_total_comm_cost_function { if (source_proc != target_proc) { if constexpr (use_node_communication_costs) { - const cost_t source_comm_cost = graph->vertex_comm_weight(source_v); + const cost_t source_comm_cost = graph->vertex_comm_weight(source_v); max_comm_weight = std::max(max_comm_weight, source_comm_cost); comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc); } else { @@ -98,108 +98,108 @@ struct kl_total_comm_cost_function { comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc); } } - } + } return work_costs + comm_costs * comm_multiplier + static_cast>(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); } template - void update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map & max_gain_recompute, std::vector &new_nodes) { - - const unsigned & start_step = thread_data.start_step; - const unsigned & end_step = thread_data.end_step; + void update_node_comm_affinity(const kl_move &move, thread_data_t &thread_data, const cost_t &penalty, const cost_t &reward, std::map &max_gain_recompute, std::vector &new_nodes) { + + const unsigned &start_step = thread_data.start_step; + const unsigned &end_step = thread_data.end_step; for (const auto &target : instance->getComputationalDag().children(move.node)) { - const unsigned target_step = active_schedule->assigned_superstep(target); + const unsigned target_step = active_schedule->assigned_superstep(target); if (target_step < start_step || target_step > end_step) continue; - if(thread_data.lock_manager.is_locked(target)) + if 
(thread_data.lock_manager.is_locked(target)) continue; if (not thread_data.affinity_table.is_selected(target)) { - new_nodes.push_back(target); + new_nodes.push_back(target); continue; } if (max_gain_recompute.find(target) != max_gain_recompute.end()) { - max_gain_recompute[target].full_update = true; + max_gain_recompute[target].full_update = true; } else { max_gain_recompute[target] = kl_gain_update_info(target, true); - } + } const unsigned target_proc = active_schedule->assigned_processor(target); - const unsigned target_start_idx = start_idx(target_step, start_step); - auto & affinity_table_target = thread_data.affinity_table.at(target); + const unsigned target_start_idx = start_idx(target_step, start_step); + auto &affinity_table_target = thread_data.affinity_table.at(target); if (move.from_step < target_step + (move.from_proc == target_proc)) { - const unsigned diff = target_step - move.from_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; - unsigned idx = target_start_idx; + const unsigned diff = target_step - move.from_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; + unsigned idx = target_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table_target[p][idx] -= penalty; - } - } + } + } if (idx - 1 < bound && is_compatible(target, move.from_proc)) { - affinity_table_target[move.from_proc][idx - 1] += penalty; + affinity_table_target[move.from_proc][idx - 1] += penalty; } } else { const unsigned diff = move.from_step - target_step; - const unsigned window_bound = end_idx(target_step, end_step); - unsigned idx = std::min(window_size + diff, window_bound); - - if (idx < window_bound && is_compatible(target, move.from_proc)) { - affinity_table_target[move.from_proc][idx] += reward; + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + + if (idx < window_bound && is_compatible(target, move.from_proc)) { + affinity_table_target[move.from_proc][idx] += reward; } idx++; - + for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table_target[p][idx] += reward; - } - } + } + } } if (move.to_step < target_step + (move.to_proc == target_proc)) { - unsigned idx = target_start_idx; - const unsigned diff = target_step - move.to_step; - const unsigned bound = window_size >= diff ? window_size - diff + 1: 0; + unsigned idx = target_start_idx; + const unsigned diff = target_step - move.to_step; + const unsigned bound = window_size >= diff ? 
window_size - diff + 1 : 0; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table_target[p][idx] += penalty; - } - } + } + } if (idx - 1 < bound && is_compatible(target, move.to_proc)) { - affinity_table_target[move.to_proc][idx - 1] -= penalty; + affinity_table_target[move.to_proc][idx - 1] -= penalty; } } else { const unsigned diff = move.to_step - target_step; - const unsigned window_bound = end_idx(target_step, end_step); - unsigned idx = std::min(window_size + diff, window_bound); - + const unsigned window_bound = end_idx(target_step, end_step); + unsigned idx = std::min(window_size + diff, window_bound); + if (idx < window_bound && is_compatible(target, move.to_proc)) { - affinity_table_target[move.to_proc][idx] -= reward; + affinity_table_target[move.to_proc][idx] -= reward; } idx++; - + for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(target)) { + for (const unsigned p : proc_range->compatible_processors_vertex(target)) { affinity_table_target[p][idx] -= reward; - } - } + } + } } - - if (move.to_proc != move.from_proc) { + + if (move.to_proc != move.from_proc) { const auto from_proc_target_comm_cost = instance->communicationCosts(move.from_proc, target_proc); const auto to_proc_target_comm_cost = instance->communicationCosts(move.to_proc, target_proc); @@ -209,21 +209,21 @@ struct kl_total_comm_cost_function { const unsigned window_bound = end_idx(target_step, end_step); for (; idx < window_bound; idx++) { for (const unsigned p : proc_range->compatible_processors_vertex(target)) { - const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain); + const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain); const auto y = 
change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_target_comm_cost, comm_gain); - affinity_table_target[p][idx] += x - y; + affinity_table_target[p][idx] += x - y; } } - } + } } for (const auto &source : instance->getComputationalDag().parents(move.node)) { - const unsigned source_step = active_schedule->assigned_superstep(source); + const unsigned source_step = active_schedule->assigned_superstep(source); if (source_step < start_step || source_step > end_step) continue; - if(thread_data.lock_manager.is_locked(source)) + if (thread_data.lock_manager.is_locked(source)) continue; if (not thread_data.affinity_table.is_selected(source)) { @@ -232,75 +232,75 @@ struct kl_total_comm_cost_function { } if (max_gain_recompute.find(source) != max_gain_recompute.end()) { - max_gain_recompute[source].full_update = true; + max_gain_recompute[source].full_update = true; } else { max_gain_recompute[source] = kl_gain_update_info(source, true); - } + } const unsigned source_proc = active_schedule->assigned_processor(source); const unsigned window_bound = end_idx(source_step, end_step); - auto & affinity_table_source = thread_data.affinity_table.at(source); + auto &affinity_table_source = thread_data.affinity_table.at(source); if (move.from_step < source_step + (move.from_proc != source_proc)) { - const unsigned diff = source_step - move.from_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = source_step - move.from_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = start_idx(source_step, start_step); for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] += reward; - } + } } if (window_size >= diff && is_compatible(source, move.from_proc)) { - affinity_table_source[move.from_proc][idx] += reward; + affinity_table_source[move.from_proc][idx] += reward; } - } else { + } else { const unsigned diff = move.from_step - source_step; - unsigned idx = window_size + diff; - + unsigned idx = window_size + diff; + if (idx < window_bound && is_compatible(source, move.from_proc)) { - affinity_table_source[move.from_proc][idx] += penalty; + affinity_table_source[move.from_proc][idx] += penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] -= penalty; - } - } + } + } } if (move.to_step < source_step + (move.to_proc != source_proc)) { - const unsigned diff = source_step - move.to_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = source_step - move.to_step; + const unsigned bound = window_size > diff ? 
window_size - diff : 0; unsigned idx = start_idx(source_step, start_step); for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] -= reward; - } + } } if (window_size >= diff && is_compatible(source, move.to_proc)) { - affinity_table_source[move.to_proc][idx] -= reward; + affinity_table_source[move.to_proc][idx] -= reward; } - } else { + } else { const unsigned diff = move.to_step - source_step; - unsigned idx = window_size + diff; + unsigned idx = window_size + diff; if (idx < window_bound && is_compatible(source, move.to_proc)) { - affinity_table_source[move.to_proc][idx] -= penalty; + affinity_table_source[move.to_proc][idx] -= penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { affinity_table_source[p][idx] += penalty; - } - } - } + } + } + } - if (move.to_proc != move.from_proc) { + if (move.to_proc != move.from_proc) { const auto from_proc_source_comm_cost = instance->communicationCosts(source_proc, move.from_proc); const auto to_proc_source_comm_cost = instance->communicationCosts(source_proc, move.to_proc); @@ -308,23 +308,23 @@ struct kl_total_comm_cost_function { unsigned idx = start_idx(source_step, start_step); for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(source)) { - const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain); + for (const unsigned p : proc_range->compatible_processors_vertex(source)) { + const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain); const cost_t y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_source_comm_cost, 
comm_gain); - affinity_table_source[p][idx] += x - y; + affinity_table_source[p][idx] += x - y; } } } - } + } } inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return (node_step < window_size + start_step) ? window_size - (node_step - start_step) : 0; } inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return (node_step + window_size <= end_step) ? window_range : window_range - (node_step + window_size - end_step); } - inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;} + inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0; } template - void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) { + void compute_comm_affinity(VertexType node, affinity_table_t &affinity_table_node, const cost_t &penalty, const cost_t &reward, const unsigned start_step, const unsigned end_step) { const unsigned node_step = active_schedule->assigned_superstep(node); const unsigned node_proc = active_schedule->assigned_processor(node); const unsigned window_bound = end_idx(node_step, end_step); @@ -332,37 +332,37 @@ struct kl_total_comm_cost_function { for (const auto &target : instance->getComputationalDag().children(node)) { const unsigned target_step = active_schedule->assigned_superstep(target); - const unsigned target_proc = active_schedule->assigned_processor(target); + const unsigned 
target_proc = active_schedule->assigned_processor(target); if (target_step < node_step + (target_proc != node_proc)) { - const unsigned diff = node_step - target_step; - const unsigned bound = window_size > diff ? window_size - diff : 0; + const unsigned diff = node_step - target_step; + const unsigned bound = window_size > diff ? window_size - diff : 0; unsigned idx = node_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] -= reward; - } + } } if (window_size >= diff && is_compatible(node, target_proc)) { - affinity_table_node[target_proc][idx] -= reward; - } + affinity_table_node[target_proc][idx] -= reward; + } - } else { + } else { const unsigned diff = target_step - node_step; unsigned idx = window_size + diff; if (idx < window_bound && is_compatible(node, target_proc)) { - affinity_table_node[target_proc][idx] -= penalty; + affinity_table_node[target_proc][idx] -= penalty; } for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] += penalty; - } - } - } + } + } + } const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier; const auto node_target_comm_cost = instance->communicationCosts(node_proc, target_proc); @@ -378,21 +378,21 @@ struct kl_total_comm_cost_function { for (const auto &source : instance->getComputationalDag().parents(node)) { const unsigned source_step = active_schedule->assigned_superstep(source); - const unsigned source_proc = active_schedule->assigned_processor(source); + const unsigned source_proc = active_schedule->assigned_processor(source); if (source_step < node_step + (source_proc == node_proc)) { - const unsigned diff = node_step - source_step; - const unsigned bound = window_size >= diff ? 
window_size - diff + 1: 0; + const unsigned diff = node_step - source_step; + const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0; unsigned idx = node_start_idx; for (; idx < bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { - affinity_table_node[p][idx] += penalty; - } + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + affinity_table_node[p][idx] += penalty; + } } if (idx - 1 < bound && is_compatible(node, source_proc)) { - affinity_table_node[source_proc][idx - 1] -= penalty; + affinity_table_node[source_proc][idx - 1] -= penalty; } } else { @@ -400,22 +400,22 @@ struct kl_total_comm_cost_function { unsigned idx = std::min(window_size + diff, window_bound); if (idx < window_bound && is_compatible(node, source_proc)) { - affinity_table_node[source_proc][idx] -= reward; + affinity_table_node[source_proc][idx] -= reward; } - + idx++; for (; idx < window_bound; idx++) { - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { affinity_table_node[p][idx] -= reward; - } - } + } + } } const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; const auto source_node_comm_cost = instance->communicationCosts(source_proc, node_proc); - for (const unsigned p : proc_range->compatible_processors_vertex(node)) { + for (const unsigned p : proc_range->compatible_processors_vertex(node)) { const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, source_proc), source_node_comm_cost, comm_gain); for (unsigned idx = node_start_idx; idx < window_bound; idx++) { affinity_table_node[p][idx] += comm_cost; @@ -426,4 +426,3 @@ struct kl_total_comm_cost_function { }; } // namespace osp - diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp 
b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp deleted file mode 100644 index f13abda9..00000000 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp +++ /dev/null @@ -1,431 +0,0 @@ -// /* -// Copyright 2024 Huawei Technologies Co., Ltd. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner -// */ - -// #pragma once - -// #include "../kl_active_schedule.hpp" -// #include "../kl_improver.hpp" - -// namespace osp { -// template -// struct kl_total_cut_cost_function { - -// using VertexType = vertex_idx_t; -// using kl_move = kl_move_struct; -// using kl_gain_update_info = kl_update_info; - -// constexpr static unsigned window_range = 2 * window_size + 1; -// constexpr static bool use_node_communication_costs = use_node_communication_costs_arg || not has_edge_weights_v; - -// kl_active_schedule *active_schedule; - -// compatible_processor_range *proc_range; - -// const Graph_t *graph; -// const BspInstance *instance; - -// cost_t comm_multiplier = 1; -// cost_t max_comm_weight = 0; - -// inline cost_t get_comm_multiplier() { return comm_multiplier; } -// inline cost_t get_max_comm_weight() { return max_comm_weight; } -// inline cost_t get_max_comm_weight_multiplied() { return max_comm_weight * comm_multiplier; } - -// const std::string name() const { return "toal_comm_cost"; } - -// inline bool 
is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); } - -// void initialize(kl_active_schedule &sched, compatible_processor_range &p_range) { -// active_schedule = &sched; -// proc_range = &p_range; -// instance = &sched.getInstance(); -// graph = &instance->getComputationalDag(); -// comm_multiplier = 1.0 / instance->numberOfProcessors(); -// } - -// cost_t compute_schedule_cost_test() { -// return compute_schedule_cost(); -// } - -// void update_datastructure_after_move(const kl_move&, const unsigned, const unsigned) {} - -// cost_t compute_schedule_cost() { - -// cost_t work_costs = 0; -// for (unsigned step = 0; step < active_schedule->num_steps(); step++) { -// work_costs += active_schedule->get_step_max_work(step); -// } - -// cost_t comm_costs = 0; -// for (const auto &edge : edges(*graph)) { - -// const auto &source_v = source(edge, *graph); -// const auto &target_v = target(edge, *graph); - -// const unsigned &source_proc = active_schedule->assigned_processor(source_v); -// const unsigned &target_proc = active_schedule->assigned_processor(target_v); - -// if ((source_proc != target_proc) || (active_schedule->assigned_superstep(source_v) != active_schedule->assigned_superstep(target_v))) { - -// if constexpr (use_node_communication_costs) { -// const cost_t source_comm_cost = graph->vertex_comm_weight(source_v); -// max_comm_weight = std::max(max_comm_weight, source_comm_cost); -// comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc); -// } else { -// const cost_t source_comm_cost = graph->edge_comm_weight(edge); -// max_comm_weight = std::max(max_comm_weight, source_comm_cost); -// comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc); -// } -// } -// } - -// return work_costs + comm_costs * comm_multiplier + static_cast>(active_schedule->num_steps() - 1) * instance->synchronisationCosts(); -// } - -// template -// void 
update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map & max_gain_recompute, std::vector &new_nodes) { - -// const unsigned & start_step = thread_data.start_step; -// const unsigned & end_step = thread_data.end_step; - -// for (const auto &target : instance->getComputationalDag().children(move.node)) { - -// const unsigned target_step = active_schedule->assigned_superstep(target); -// if (target_step < start_step || target_step > end_step) -// continue; - -// if(thread_data.lock_manager.is_locked(target)) -// continue; - -// if (not thread_data.affinity_table.is_selected(target)) { -// new_nodes.push_back(target); -// continue; -// } - -// if (max_gain_recompute.find(target) != max_gain_recompute.end()) { -// max_gain_recompute[target].full_update = true; -// } else { -// max_gain_recompute[target] = kl_gain_update_info(target, true); -// } - -// const unsigned target_proc = active_schedule->assigned_processor(target); -// const unsigned target_start_idx = start_idx(target_step, start_step); -// auto & affinity_table_target = thread_data.affinity_table.at(target); - -// if (move.from_step < target_step + (move.from_proc == target_proc)) { - -// const unsigned diff = target_step - move.from_step; -// const unsigned bound = window_size >= diff ? 
window_size - diff + 1: 0; -// unsigned idx = target_start_idx; -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// affinity_table_target[p][idx] -= penalty; -// } -// } - -// if (idx - 1 < bound && is_compatible(target, move.from_proc)) { -// affinity_table_target[move.from_proc][idx - 1] += penalty; -// } - -// } else { - -// const unsigned diff = move.from_step - target_step; -// const unsigned window_bound = end_idx(target_step, end_step); -// unsigned idx = std::min(window_size + diff, window_bound); - -// if (idx < window_bound && is_compatible(target, move.from_proc)) { -// affinity_table_target[move.from_proc][idx] += reward; -// } - -// idx++; - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// affinity_table_target[p][idx] += reward; -// } -// } -// } - -// if (move.to_step < target_step + (move.to_proc == target_proc)) { -// unsigned idx = target_start_idx; -// const unsigned diff = target_step - move.to_step; -// const unsigned bound = window_size >= diff ? 
window_size - diff + 1: 0; -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// affinity_table_target[p][idx] += penalty; -// } -// } - -// if (idx - 1 < bound && is_compatible(target, move.to_proc)) { -// affinity_table_target[move.to_proc][idx - 1] -= penalty; -// } - -// } else { -// const unsigned diff = move.to_step - target_step; -// const unsigned window_bound = end_idx(target_step, end_step); -// unsigned idx = std::min(window_size + diff, window_bound); - -// if (idx < window_bound && is_compatible(target, move.to_proc)) { -// affinity_table_target[move.to_proc][idx] -= reward; -// } - -// idx++; - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// affinity_table_target[p][idx] -= reward; -// } -// } -// } - -// if (move.to_proc != move.from_proc) { -// const auto from_proc_target_comm_cost = instance->communicationCosts(move.from_proc, target_proc); -// const auto to_proc_target_comm_cost = instance->communicationCosts(move.to_proc, target_proc); - -// const cost_t comm_gain = graph->vertex_comm_weight(move.node) * comm_multiplier; - -// unsigned idx = target_start_idx; -// const unsigned window_bound = end_idx(target_step, end_step); -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(target)) { -// const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain); -// const auto y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_target_comm_cost, comm_gain); -// affinity_table_target[p][idx] += x - y; -// } -// } -// } -// } - -// for (const auto &source : instance->getComputationalDag().parents(move.node)) { - -// const unsigned source_step = active_schedule->assigned_superstep(source); -// if (source_step < start_step || source_step > end_step) -// continue; - -// 
if(thread_data.lock_manager.is_locked(source)) -// continue; - -// if (not thread_data.affinity_table.is_selected(source)) { -// new_nodes.push_back(source); -// continue; -// } - -// if (max_gain_recompute.find(source) != max_gain_recompute.end()) { -// max_gain_recompute[source].full_update = true; -// } else { -// max_gain_recompute[source] = kl_gain_update_info(source, true); -// } - -// const unsigned source_proc = active_schedule->assigned_processor(source); -// const unsigned window_bound = end_idx(source_step, end_step); -// auto & affinity_table_source = thread_data.affinity_table.at(source); - -// if (move.from_step < source_step + (move.from_proc != source_proc)) { - -// const unsigned diff = source_step - move.from_step; -// const unsigned bound = window_size > diff ? window_size - diff : 0; -// unsigned idx = start_idx(source_step, start_step); -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// affinity_table_source[p][idx] += reward; -// } -// } - -// if (window_size >= diff && is_compatible(source, move.from_proc)) { -// affinity_table_source[move.from_proc][idx] += reward; -// } - -// } else { - -// const unsigned diff = move.from_step - source_step; -// unsigned idx = window_size + diff; - -// if (idx < window_bound && is_compatible(source, move.from_proc)) { -// affinity_table_source[move.from_proc][idx] += penalty; -// } - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// affinity_table_source[p][idx] -= penalty; -// } -// } -// } - -// if (move.to_step < source_step + (move.to_proc != source_proc)) { -// const unsigned diff = source_step - move.to_step; -// const unsigned bound = window_size > diff ? 
window_size - diff : 0; -// unsigned idx = start_idx(source_step, start_step); -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// affinity_table_source[p][idx] -= reward; -// } -// } - -// if (window_size >= diff && is_compatible(source, move.to_proc)) { -// affinity_table_source[move.to_proc][idx] -= reward; -// } - -// } else { -// const unsigned diff = move.to_step - source_step; -// unsigned idx = window_size + diff; - -// if (idx < window_bound && is_compatible(source, move.to_proc)) { -// affinity_table_source[move.to_proc][idx] -= penalty; -// } -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// affinity_table_source[p][idx] += penalty; -// } -// } -// } - -// if (move.to_proc != move.from_proc) { -// const auto from_proc_source_comm_cost = instance->communicationCosts(source_proc, move.from_proc); -// const auto to_proc_source_comm_cost = instance->communicationCosts(source_proc, move.to_proc); - -// const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; - -// unsigned idx = start_idx(source_step, start_step); -// const unsigned window_bound = end_idx(source_step, end_step); -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(source)) { -// const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain); -// const cost_t y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_source_comm_cost, comm_gain); -// affinity_table_source[p][idx] += x - y; -// } -// } -// } -// } -// } - -// inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return (node_step < window_size + start_step) ? 
window_size - (node_step - start_step) : 0; } -// inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return (node_step + window_size <= end_step) ? window_range : window_range - (node_step + window_size - end_step); } - -// inline cost_t change_comm_cost(const v_commw_t &p_target_comm_cost, const v_commw_t &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;} - -// template -// void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) { -// const unsigned node_step = active_schedule->assigned_superstep(node); -// const unsigned node_proc = active_schedule->assigned_processor(node); -// const unsigned window_bound = end_idx(node_step, end_step); -// const unsigned node_start_idx = start_idx(node_step, start_step); - -// for (const auto &target : instance->getComputationalDag().children(node)) { -// const unsigned target_step = active_schedule->assigned_superstep(target); -// const unsigned target_proc = active_schedule->assigned_processor(target); - -// if (target_step < node_step + (target_proc != node_proc)) { -// const unsigned diff = node_step - target_step; -// const unsigned bound = window_size > diff ? 
window_size - diff : 0; -// unsigned idx = node_start_idx; - -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// affinity_table_node[p][idx] -= reward; -// } -// } - -// if (window_size >= diff && is_compatible(node, target_proc)) { -// affinity_table_node[target_proc][idx] -= reward; -// } - -// } else { -// const unsigned diff = target_step - node_step; -// unsigned idx = window_size + diff; - -// if (idx < window_bound && is_compatible(node, target_proc)) { -// affinity_table_node[target_proc][idx] -= penalty; -// } - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// affinity_table_node[p][idx] += penalty; -// } -// } -// } - -// const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier; -// const auto node_target_comm_cost = instance->communicationCosts(node_proc, target_proc); - -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// if (p != target_proc) { -// const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, target_proc), node_target_comm_cost, comm_gain); -// for (unsigned idx = node_start_idx; idx < window_bound; idx++) { -// affinity_table_node[p][idx] += comm_cost; -// } -// } else { -// for (unsigned idx = node_start_idx; idx < window_bound; idx++) { -// if(idx == 0) continue; -// affinity_table_node[p][idx] += comm_gain; -// } -// } -// } - -// } // traget - -// for (const auto &source : instance->getComputationalDag().parents(node)) { -// const unsigned source_step = active_schedule->assigned_superstep(source); -// const unsigned source_proc = active_schedule->assigned_processor(source); - -// if (source_step < node_step + (source_proc == node_proc)) { -// const unsigned diff = node_step - source_step; -// const unsigned bound = window_size >= diff ? 
window_size - diff + 1: 0; -// unsigned idx = node_start_idx; - -// for (; idx < bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// affinity_table_node[p][idx] += penalty; -// } -// } - -// if (idx - 1 < bound && is_compatible(node, source_proc)) { -// affinity_table_node[source_proc][idx - 1] -= penalty; -// } - -// } else { -// const unsigned diff = source_step - node_step; -// unsigned idx = std::min(window_size + diff, window_bound); - -// if (idx < window_bound && is_compatible(node, source_proc)) { -// affinity_table_node[source_proc][idx] -= reward; -// } - -// idx++; - -// for (; idx < window_bound; idx++) { -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// affinity_table_node[p][idx] -= reward; -// } -// } -// } - -// const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier; -// const auto source_node_comm_cost = instance->communicationCosts(source_proc, node_proc); - -// for (const unsigned p : proc_range->compatible_processors_vertex(node)) { -// const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, source_proc), source_node_comm_cost, comm_gain); -// for (unsigned idx = node_start_idx; idx < window_bound; idx++) { -// affinity_table_node[p][idx] += comm_cost; -// } -// } -// } // source -// } -// }; - -// } // namespace osp - diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp index 97bd35a7..3657ed52 100644 --- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp +++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp @@ -152,7 +152,7 @@ class kl_improver : public ImprovementScheduler { const Graph_t *graph; const BspInstance *instance; - compatible_processor_range proc_range; + CompatibleProcessorRange proc_range; kl_parameter parameters; std::mt19937 gen; diff --git 
a/include/osp/dag_divider/AbstractWavefrontScheduler.hpp b/include/osp/dag_divider/AbstractWavefrontScheduler.hpp index 556e82bc..69a3c80c 100644 --- a/include/osp/dag_divider/AbstractWavefrontScheduler.hpp +++ b/include/osp/dag_divider/AbstractWavefrontScheduler.hpp @@ -21,10 +21,10 @@ limitations under the License. #include "osp/graph_algorithms/computational_dag_util.hpp" #include "osp/graph_algorithms/subgraph_algorithms.hpp" #include "osp/graph_implementations/boost_graphs/boost_graph.hpp" -#include #include -#include #include +#include +#include namespace osp { @@ -34,7 +34,7 @@ namespace osp { */ template class AbstractWavefrontScheduler : public Scheduler { -protected: + protected: IDagDivider *divider; Scheduler *scheduler; static constexpr bool enable_debug_prints = true; @@ -46,17 +46,17 @@ class AbstractWavefrontScheduler : public Scheduler { */ bool distributeProcessors( unsigned total_processors_of_type, - const std::vector& work_weights, - std::vector& allocation) const { - + const std::vector &work_weights, + std::vector &allocation) const { + allocation.assign(work_weights.size(), 0); double total_work = std::accumulate(work_weights.begin(), work_weights.end(), 0.0); if (total_work <= 1e-9 || total_processors_of_type == 0) { return false; } - + std::vector active_indices; - for(size_t i = 0; i < work_weights.size(); ++i) { + for (size_t i = 0; i < work_weights.size(); ++i) { if (work_weights[i] > 1e-9) { active_indices.push_back(i); } @@ -68,7 +68,7 @@ class AbstractWavefrontScheduler : public Scheduler { size_t num_active_components = active_indices.size(); unsigned remaining_procs = total_processors_of_type; - + // --- Stage 1: Guarantee at least one processor if possible (anti-starvation) --- if (total_processors_of_type >= num_active_components) { // Abundance case: Give one processor to each active component first. 
@@ -79,11 +79,11 @@ class AbstractWavefrontScheduler : public Scheduler { } else { // Scarcity case: Not enough processors for each active component. std::vector> sorted_work; - for(size_t idx : active_indices) { + for (size_t idx : active_indices) { sorted_work.push_back({work_weights[idx], idx}); } std::sort(sorted_work.rbegin(), sorted_work.rend()); - for(unsigned i = 0; i < remaining_procs; ++i) { + for (unsigned i = 0; i < remaining_procs; ++i) { allocation[sorted_work[i].second]++; } return true; // Scarcity case was hit. @@ -93,10 +93,10 @@ class AbstractWavefrontScheduler : public Scheduler { if (remaining_procs > 0) { std::vector adjusted_work_weights; double adjusted_total_work = 0; - + double work_per_proc = total_work / static_cast(total_processors_of_type); - for(size_t idx : active_indices) { + for (size_t idx : active_indices) { double adjusted_work = std::max(0.0, work_weights[idx] - work_per_proc); adjusted_work_weights.push_back(adjusted_work); adjusted_total_work += adjusted_work; @@ -123,14 +123,13 @@ class AbstractWavefrontScheduler : public Scheduler { } } } - } + } return false; // Scarcity case was not hit. } - BspArchitecture createSubArchitecture( const BspArchitecture &original_arch, - const std::vector& sub_dag_proc_types) const { + const std::vector &sub_dag_proc_types) const { // The calculation is now inside the assert, so it only happens in debug builds. 
assert(std::accumulate(sub_dag_proc_types.begin(), sub_dag_proc_types.end(), 0u) > 0 && "Attempted to create a sub-architecture with zero processors."); @@ -142,33 +141,34 @@ class AbstractWavefrontScheduler : public Scheduler { sub_dag_processor_memory[original_arch.processorType(i)] = std::min(original_arch.memoryBound(i), sub_dag_processor_memory[original_arch.processorType(i)]); } - sub_architecture.set_processors_consequ_types(sub_dag_proc_types, sub_dag_processor_memory); + sub_architecture.SetProcessorsConsequTypes(sub_dag_proc_types, sub_dag_processor_memory); return sub_architecture; } - bool validateWorkDistribution(const std::vector& sub_dags, const BspInstance& instance) const { - const auto& original_arch = instance.getArchitecture(); - for (const auto& rep_sub_dag : sub_dags) { + bool validateWorkDistribution(const std::vector &sub_dags, const BspInstance &instance) const { + const auto &original_arch = instance.getArchitecture(); + for (const auto &rep_sub_dag : sub_dags) { const double total_rep_work = sumOfVerticesWorkWeights(rep_sub_dag); - + double sum_of_compatible_works_for_rep = 0.0; for (unsigned type_idx = 0; type_idx < original_arch.getNumberOfProcessorTypes(); ++type_idx) { sum_of_compatible_works_for_rep += sumOfCompatibleWorkWeights(rep_sub_dag, instance, type_idx); } if (sum_of_compatible_works_for_rep > total_rep_work + 1e-9) { - if constexpr (enable_debug_prints) std::cerr << "ERROR: Sum of compatible work (" << sum_of_compatible_works_for_rep - << ") exceeds total work (" << total_rep_work - << ") for a sub-dag. Aborting." << std::endl; + if constexpr (enable_debug_prints) + std::cerr << "ERROR: Sum of compatible work (" << sum_of_compatible_works_for_rep + << ") exceeds total work (" << total_rep_work + << ") for a sub-dag. Aborting." 
<< std::endl; return false; } } return true; } -public: + public: AbstractWavefrontScheduler(IDagDivider &div, Scheduler &sched) : divider(&div), scheduler(&sched) {} }; -} +} // namespace osp diff --git a/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp b/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp index 5ba326d9..d1d61016 100644 --- a/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp +++ b/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp @@ -16,22 +16,22 @@ limitations under the License. @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner */ -#include -#include -#include -#include -#include "OrbitGraphProcessor.hpp" +#include "EftSubgraphScheduler.hpp" #include "HashComputer.hpp" #include "MerkleHashComputer.hpp" -#include "EftSubgraphScheduler.hpp" +#include "OrbitGraphProcessor.hpp" #include "TrimmedGroupScheduler.hpp" #include "osp/auxiliary/io/DotFileWriter.hpp" #include "osp/bsp/scheduler/Scheduler.hpp" #include "osp/graph_algorithms/subgraph_algorithms.hpp" +#include +#include +#include +#include namespace osp { -/** +/** * @brief A scheduler that leverages isomorphic subgraphs to partition a DAG. 
* * @class IsomorphicSubgraphScheduler @@ -58,12 +58,11 @@ class IsomorphicSubgraphScheduler { static_assert(std::is_same_v, vertex_idx_t>, "Graph_t and Constr_Graph_t must have the same vertex_idx types"); - private: - - static constexpr bool verbose = false; - const HashComputer>* hash_computer_; + private: + static constexpr bool verbose = false; + const HashComputer> *hash_computer_; size_t symmetry_ = 4; - Scheduler * bsp_scheduler_; + Scheduler *bsp_scheduler_; bool use_max_group_size_ = false; unsigned max_group_size_ = 0; bool plot_dot_graphs_ = false; @@ -76,22 +75,21 @@ class IsomorphicSubgraphScheduler { bool use_max_bsp = false; bool use_adaptive_symmetry_threshold = true; - public: - - explicit IsomorphicSubgraphScheduler(Scheduler & bsp_scheduler) + public: + explicit IsomorphicSubgraphScheduler(Scheduler &bsp_scheduler) : hash_computer_(nullptr), bsp_scheduler_(&bsp_scheduler), plot_dot_graphs_(false) {} - IsomorphicSubgraphScheduler(Scheduler & bsp_scheduler, const HashComputer>& hash_computer) + IsomorphicSubgraphScheduler(Scheduler &bsp_scheduler, const HashComputer> &hash_computer) : hash_computer_(&hash_computer), bsp_scheduler_(&bsp_scheduler), plot_dot_graphs_(false) {} virtual ~IsomorphicSubgraphScheduler() {} - void setMergeDifferentTypes(bool flag) {merge_different_node_types = flag;} - void setWorkThreshold(v_workw_t work_threshold) {work_threshold_ = work_threshold;} - void setCriticalPathThreshold(v_workw_t critical_path_threshold) {critical_path_threshold_ = critical_path_threshold;} - void setOrbitLockRatio(double orbit_lock_ratio) {orbit_lock_ratio_ = orbit_lock_ratio;} - void setNaturalBreaksCountPercentage(double natural_breaks_count_percentage) {natural_breaks_count_percentage_ = natural_breaks_count_percentage;} - void setAllowTrimmedScheduler(bool flag) {allow_use_trimmed_scheduler = flag;} + void setMergeDifferentTypes(bool flag) { merge_different_node_types = flag; } + void setWorkThreshold(v_workw_t work_threshold) { 
work_threshold_ = work_threshold; } + void setCriticalPathThreshold(v_workw_t critical_path_threshold) { critical_path_threshold_ = critical_path_threshold; } + void setOrbitLockRatio(double orbit_lock_ratio) { orbit_lock_ratio_ = orbit_lock_ratio; } + void setNaturalBreaksCountPercentage(double natural_breaks_count_percentage) { natural_breaks_count_percentage_ = natural_breaks_count_percentage; } + void setAllowTrimmedScheduler(bool flag) { allow_use_trimmed_scheduler = flag; } void set_plot_dot_graphs(bool plot) { plot_dot_graphs_ = plot; } void disable_use_max_group_size() { use_max_group_size_ = false; } void setUseMaxBsp(bool flag) { use_max_bsp = flag; } @@ -100,12 +98,12 @@ class IsomorphicSubgraphScheduler { max_group_size_ = max_group_size; } void setEnableAdaptiveSymmetryThreshold() { use_adaptive_symmetry_threshold = true; } - void setUseStaticSymmetryLevel(size_t static_symmetry_level) { - use_adaptive_symmetry_threshold = false; - symmetry_ = static_symmetry_level; + void setUseStaticSymmetryLevel(size_t static_symmetry_level) { + use_adaptive_symmetry_threshold = false; + symmetry_ = static_symmetry_level; } - std::vector> compute_partition(const BspInstance& instance) { + std::vector> compute_partition(const BspInstance &instance) { OrbitGraphProcessor orbit_processor; orbit_processor.set_work_threshold(work_threshold_); orbit_processor.setMergeDifferentNodeTypes(merge_different_node_types); @@ -116,7 +114,7 @@ class IsomorphicSubgraphScheduler { orbit_processor.setUseStaticSymmetryLevel(symmetry_); } - std::unique_ptr>> local_hasher; + std::unique_ptr>> local_hasher; if (!hash_computer_) { local_hasher = std::make_unique, true>>(instance.getComputationalDag(), instance.getComputationalDag()); hash_computer_ = local_hasher.get(); @@ -125,7 +123,7 @@ class IsomorphicSubgraphScheduler { orbit_processor.discover_isomorphic_groups(instance.getComputationalDag(), *hash_computer_); auto isomorphic_groups = orbit_processor.get_final_groups(); - + 
std::vector was_trimmed(isomorphic_groups.size(), false); trim_subgraph_groups(isomorphic_groups, instance, was_trimmed); // Apply trimming and record which groups were affected @@ -157,8 +155,7 @@ class IsomorphicSubgraphScheduler { return partition; } - protected: - + protected: template struct subgraph_scheduler_input { BspInstance instance; @@ -167,14 +164,14 @@ class IsomorphicSubgraphScheduler { std::vector>> required_proc_types; }; - void trim_subgraph_groups(std::vector::Group>& isomorphic_groups, - const BspInstance& instance, - std::vector& was_trimmed) { + void trim_subgraph_groups(std::vector::Group> &isomorphic_groups, + const BspInstance &instance, + std::vector &was_trimmed) { if constexpr (verbose) { std::cout << "\n--- Trimming Isomorphic Subgraph Groups ---" << std::endl; } for (size_t group_idx = 0; group_idx < isomorphic_groups.size(); ++group_idx) { - auto& group = isomorphic_groups[group_idx]; + auto &group = isomorphic_groups[group_idx]; const unsigned group_size = static_cast(group.size()); if (group_size <= 1) continue; @@ -194,24 +191,24 @@ class IsomorphicSubgraphScheduler { if constexpr (has_typed_vertices_v) { if (!group.subgraphs.empty() && !group.subgraphs[0].empty()) { common_node_type = instance.getComputationalDag().vertex_type(group.subgraphs[0][0]); - const auto& rep_subgraph = group.subgraphs[0]; - for (const auto& vertex : rep_subgraph) { + const auto &rep_subgraph = group.subgraphs[0]; + for (const auto &vertex : rep_subgraph) { if (instance.getComputationalDag().vertex_type(vertex) != common_node_type) { is_single_type_group = false; break; } } } else { - is_single_type_group = false; + is_single_type_group = false; } } else { - is_single_type_group = false; + is_single_type_group = false; } if (is_single_type_group) { // Dynamically determine min_proc_type_count based on compatible processors for this type unsigned min_compatible_processors = std::numeric_limits::max(); - const auto& proc_type_counts = 
instance.getArchitecture().getProcessorTypeCount(); + const auto &proc_type_counts = instance.getArchitecture().getProcessorTypeCount(); bool found_compatible_processor = false; for (unsigned proc_type_idx = 0; proc_type_idx < proc_type_counts.size(); ++proc_type_idx) { @@ -222,13 +219,13 @@ class IsomorphicSubgraphScheduler { } if (found_compatible_processor) { if constexpr (verbose) { - std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type + std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type << "). Min compatible processors: " << min_compatible_processors << "." << std::endl; } effective_min_proc_type_count = min_compatible_processors; } else { if constexpr (verbose) { - std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type + std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type << ") but no compatible processors found. Disabling trimming." << std::endl; } // If no compatible processors found for this type, effectively disable trimming for this group. @@ -236,7 +233,11 @@ class IsomorphicSubgraphScheduler { } } else { // Fallback to a default min_proc_type_count if not a single-type group or no typed vertices. - effective_min_proc_type_count = instance.getArchitecture().getMinProcessorTypeCount(); + const auto &type_count = instance.getArchitecture().getProcessorTypeCount(); + // std::min_element returns end() for an empty range, so only dereference it when a processor type exists. + effective_min_proc_type_count = type_count.empty() + ? 0 + : *std::min_element(type_count.begin(), type_count.end()); if constexpr (verbose) { std::cout << "Group " << group_idx << " (size " << group_size << "): Multi-type or untyped group. Using default min_proc_type_count: " << effective_min_proc_type_count << "." 
<< std::endl; } @@ -257,13 +258,13 @@ class IsomorphicSubgraphScheduler { if (gcd < group_size) { if constexpr (verbose) { - std::cout << " -> Trimming group " << group_idx << ". GCD(" << group_size << ", " << effective_min_proc_type_count + std::cout << " -> Trimming group " << group_idx << ". GCD(" << group_size << ", " << effective_min_proc_type_count << ") = " << gcd << ". Merging " << group_size / gcd << " subgraphs at a time." << std::endl; } if (allow_use_trimmed_scheduler) gcd = 1; - + was_trimmed[group_idx] = true; const unsigned merge_size = group_size / gcd; std::vector>> new_subgraphs; @@ -279,7 +280,7 @@ class IsomorphicSubgraphScheduler { } for (unsigned k = 0; k < merge_size; ++k) { - const auto& sg_to_merge_vertices = group.subgraphs[original_sg_cursor]; + const auto &sg_to_merge_vertices = group.subgraphs[original_sg_cursor]; original_sg_cursor++; merged_sg_vertices.insert(merged_sg_vertices.end(), sg_to_merge_vertices.begin(), sg_to_merge_vertices.end()); } @@ -292,14 +293,14 @@ class IsomorphicSubgraphScheduler { } was_trimmed[group_idx] = false; } - } + } } subgraph_scheduler_input prepare_subgraph_scheduling_input( - const BspInstance& original_instance, - const std::vector::Group>& isomorphic_groups, - const std::vector& was_trimmed) { - + const BspInstance &original_instance, + const std::vector::Group> &isomorphic_groups, + const std::vector &was_trimmed) { + subgraph_scheduler_input result; result.instance.setArchitecture(original_instance.getArchitecture()); const unsigned num_proc_types = original_instance.getArchitecture().getNumberOfProcessorTypes(); @@ -332,35 +333,35 @@ class IsomorphicSubgraphScheduler { ++coarse_node_idx; } coarser_util::construct_coarse_dag(original_instance.getComputationalDag(), result.instance.getComputationalDag(), - contraction_map); + contraction_map); if constexpr (verbose) { std::cout << "\n--- Preparing Subgraph Scheduling Input ---\n"; std::cout << "Found " << isomorphic_groups.size() << " isomorphic 
groups to schedule as coarse nodes.\n"; for (size_t j = 0; j < isomorphic_groups.size(); ++j) { std::cout << " - Coarse Node " << j << " (from " << isomorphic_groups[j].subgraphs.size() - << " isomorphic subgraphs):\n"; + << " isomorphic subgraphs):\n"; std::cout << " - Multiplicity for scheduling: " << result.multiplicities[j] << "\n"; std::cout << " - Total Work (in coarse graph): " << result.instance.getComputationalDag().vertex_work_weight(j) << "\n"; std::cout << " - Required Processor Types: "; for (unsigned k = 0; k < num_proc_types; ++k) { std::cout << result.required_proc_types[j][k] << " "; } - std::cout << "\n"; + std::cout << "\n"; std::cout << " - Max number of processors: " << result.max_num_processors[j] << "\n"; } } return result; } - void schedule_isomorphic_group(const BspInstance& instance, - const std::vector::Group>& isomorphic_groups, - const SubgraphSchedule & sub_sched, - std::vector> & partition) { + void schedule_isomorphic_group(const BspInstance &instance, + const std::vector::Group> &isomorphic_groups, + const SubgraphSchedule &sub_sched, + std::vector> &partition) { vertex_idx_t current_partition_idx = 0; for (size_t group_idx = 0; group_idx < isomorphic_groups.size(); ++group_idx) { - const auto& group = isomorphic_groups[group_idx]; + const auto &group = isomorphic_groups[group_idx]; if (group.subgraphs.empty()) { continue; } @@ -373,47 +374,47 @@ class IsomorphicSubgraphScheduler { auto rep_global_to_local_map = create_induced_subgraph_map(instance.getComputationalDag(), representative_instance.getComputationalDag(), rep_subgraph_vertices_sorted); representative_instance.setArchitecture(instance.getArchitecture()); - const auto& procs_for_group = sub_sched.node_assigned_worker_per_type[group_idx]; + const auto &procs_for_group = sub_sched.node_assigned_worker_per_type[group_idx]; std::vector> mem_weights(procs_for_group.size(), 0); for (unsigned proc_type = 0; proc_type < procs_for_group.size(); ++proc_type) { mem_weights[proc_type] 
= static_cast>(instance.getArchitecture().maxMemoryBoundProcType(proc_type)); } - representative_instance.getArchitecture().set_processors_consequ_types(procs_for_group, mem_weights); + representative_instance.getArchitecture().SetProcessorsConsequTypes(procs_for_group, mem_weights); representative_instance.setNodeProcessorCompatibility(instance.getProcessorCompatibilityMatrix()); // --- Decide which scheduler to use --- unsigned min_non_zero_procs = std::numeric_limits::max(); - for (const auto& proc_count : procs_for_group) { + for (const auto &proc_count : procs_for_group) { if (proc_count > 0) { min_non_zero_procs = std::min(min_non_zero_procs, proc_count); } } - bool use_trimmed_scheduler = sub_sched.was_trimmed[group_idx] && min_non_zero_procs > 1 && allow_use_trimmed_scheduler; - - Scheduler* scheduler_for_group_ptr; + + Scheduler *scheduler_for_group_ptr; std::unique_ptr> trimmed_scheduler_owner; if (use_trimmed_scheduler) { - if constexpr (verbose) std::cout << "Using TrimmedGroupScheduler for group " << group_idx << std::endl; + if constexpr (verbose) + std::cout << "Using TrimmedGroupScheduler for group " << group_idx << std::endl; trimmed_scheduler_owner = std::make_unique>(*bsp_scheduler_, min_non_zero_procs); scheduler_for_group_ptr = trimmed_scheduler_owner.get(); } else { - if constexpr (verbose) std::cout << "Using standard BSP scheduler for group " << group_idx << std::endl; + if constexpr (verbose) + std::cout << "Using standard BSP scheduler for group " << group_idx << std::endl; scheduler_for_group_ptr = bsp_scheduler_; } - // --- Schedule the representative to get the pattern --- BspSchedule bsp_schedule(representative_instance); if constexpr (verbose) { std::cout << "--- Scheduling representative for group " << group_idx << " ---" << std::endl; std::cout << " Number of subgraphs in group: " << group.subgraphs.size() << std::endl; - const auto& rep_dag = representative_instance.getComputationalDag(); + const auto &rep_dag = 
representative_instance.getComputationalDag(); std::cout << " Representative subgraph size: " << rep_dag.num_vertices() << " vertices" << std::endl; std::vector node_type_counts(rep_dag.num_vertex_types(), 0); - for (const auto& v : rep_dag.vertices()) { + for (const auto &v : rep_dag.vertices()) { node_type_counts[rep_dag.vertex_type(v)]++; } std::cout << " Node type counts: "; @@ -424,45 +425,43 @@ class IsomorphicSubgraphScheduler { } std::cout << std::endl; - const auto& sub_arch = representative_instance.getArchitecture(); + const auto &sub_arch = representative_instance.getArchitecture(); std::cout << " Sub-architecture for scheduling:" << std::endl; std::cout << " Processors: " << sub_arch.numberOfProcessors() << std::endl; std::cout << " Processor types counts: "; - const auto& type_counts = sub_arch.getProcessorTypeCount(); + const auto &type_counts = sub_arch.getProcessorTypeCount(); for (size_t type_idx = 0; type_idx < type_counts.size(); ++type_idx) { std::cout << "T" << type_idx << ":" << type_counts[type_idx] << " "; } std::cout << std::endl; std::cout << " Sync cost: " << sub_arch.synchronisationCosts() << ", Comm cost: " << sub_arch.communicationCosts() << std::endl; std::cout << " Sub-problem compatibility matrix:" << std::endl; - const auto & sub_comp_matrix = representative_instance.getNodeNodeCompatabilityMatrix(); - for(unsigned i = 0; i < sub_comp_matrix.size(); ++i) { + const auto &sub_comp_matrix = representative_instance.getNodeNodeCompatabilityMatrix(); + for (unsigned i = 0; i < sub_comp_matrix.size(); ++i) { std::cout << " Node Type " << i << ": [ "; for (unsigned j = 0; j < sub_comp_matrix[i].size(); ++j) { std::cout << (sub_comp_matrix[i][j] ? 
"1" : "0") << " "; } std::cout << "]" << std::endl; } - } - + scheduler_for_group_ptr->computeSchedule(bsp_schedule); if constexpr (verbose) { - std::cout << " Schedule satisfies precedence constraints: "; + std::cout << " Schedule satisfies precedence constraints: "; std::cout << bsp_schedule.satisfiesPrecedenceConstraints() << std::endl; std::cout << " Schedule satisfies node type constraints: "; std::cout << bsp_schedule.satisfiesNodeTypeConstraints() << std::endl; } - if (plot_dot_graphs_) { - const auto& rep_dag = bsp_schedule.getInstance().getComputationalDag(); + const auto &rep_dag = bsp_schedule.getInstance().getComputationalDag(); std::vector colors(rep_dag.num_vertices()); std::map, unsigned> proc_ss_to_color; unsigned next_color = 0; - for (const auto& v : rep_dag.vertices()) { + for (const auto &v : rep_dag.vertices()) { const auto assignment = std::make_pair(bsp_schedule.assignedProcessor(v), bsp_schedule.assignedSuperstep(v)); if (proc_ss_to_color.find(assignment) == proc_ss_to_color.end()) { proc_ss_to_color[assignment] = next_color++; @@ -476,12 +475,10 @@ class IsomorphicSubgraphScheduler { ss << std::put_time(std::localtime(&in_time_t), "%Y%m%d_%H%M%S"); std::string timestamp = ss.str() + "_"; - DotFileWriter writer; writer.write_colored_graph(timestamp + "iso_group_rep_" + std::to_string(group_idx) + ".dot", rep_dag, colors); } - const bool max_bsp = use_max_bsp && (representative_instance.getComputationalDag().num_edges() == 0) && (representative_instance.getComputationalDag().vertex_type(0) == 0); // Build data structures for applying the pattern --- @@ -491,10 +488,9 @@ class IsomorphicSubgraphScheduler { for (vertex_idx_t j = 0; j < static_cast>(rep_subgraph_vertices_sorted.size()); ++j) { auto sp_pair = std::make_pair(bsp_schedule.assignedSuperstep(j), bsp_schedule.assignedProcessor(j)); - if (max_bsp) + if (max_bsp) sp_pair = std::make_pair(j, 0); - if (sp_proc_to_relative_partition.find(sp_pair) == sp_proc_to_relative_partition.end()) { 
sp_proc_to_relative_partition[sp_pair] = num_partitions_per_subgraph++; } @@ -516,12 +512,12 @@ class IsomorphicSubgraphScheduler { } else { // For other subgraphs, build the isomorphic mapping Constr_Graph_t current_subgraph_graph; create_induced_subgraph(instance.getComputationalDag(), current_subgraph_graph, current_subgraph_vertices_sorted); - + MerkleHashComputer current_hasher(current_subgraph_graph); - for(const auto& [hash, rep_orbit_nodes] : rep_hasher.get_orbits()) { - const auto& current_orbit_nodes = current_hasher.get_orbit_from_hash(hash); - for(size_t k = 0; k < rep_orbit_nodes.size(); ++k) { + for (const auto &[hash, rep_orbit_nodes] : rep_hasher.get_orbits()) { + const auto ¤t_orbit_nodes = current_hasher.get_orbit_from_hash(hash); + for (size_t k = 0; k < rep_orbit_nodes.size(); ++k) { // Map: current_subgraph_vertex -> representative_subgraph_local_idx current_vertex_to_rep_local_idx[current_subgraph_vertices_sorted[current_orbit_nodes[k]]] = static_cast>(rep_orbit_nodes[k]); } @@ -529,11 +525,11 @@ class IsomorphicSubgraphScheduler { } // Apply the partition pattern - for (const auto& current_vertex : current_subgraph_vertices_sorted) { + for (const auto ¤t_vertex : current_subgraph_vertices_sorted) { const auto rep_local_idx = current_vertex_to_rep_local_idx.at(current_vertex); auto sp_pair = std::make_pair(bsp_schedule.assignedSuperstep(rep_local_idx), bsp_schedule.assignedProcessor(rep_local_idx)); - if (max_bsp) + if (max_bsp) sp_pair = std::make_pair(rep_local_idx, 0); partition[current_vertex] = current_partition_idx + sp_proc_to_relative_partition.at(sp_pair); @@ -544,4 +540,4 @@ class IsomorphicSubgraphScheduler { } }; -} \ No newline at end of file +} // namespace osp \ No newline at end of file diff --git a/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp b/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp index 0b125e71..88dcf1fa 100644 --- 
a/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp +++ b/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp @@ -19,8 +19,8 @@ limitations under the License. #pragma once #include "osp/bsp/scheduler/Scheduler.hpp" -#include "osp/graph_algorithms/subgraph_algorithms.hpp" #include "osp/graph_algorithms/computational_dag_util.hpp" +#include "osp/graph_algorithms/subgraph_algorithms.hpp" #include #include @@ -35,7 +35,7 @@ namespace osp { * potentially disconnected, subgraph that resulted from merging smaller isomorphic subgraphs. It divides * the input graph into its weakly connected components and schedules them on proportionally allocated processors. */ -template +template class TrimmedGroupScheduler : public Scheduler { Scheduler *sub_scheduler; @@ -94,7 +94,7 @@ class TrimmedGroupScheduler : public Scheduler { // Determine the processor allocation for a single sub-problem. // Calculate offsets for processor types within the main 'arch' (passed to TrimmedGroupScheduler) std::vector arch_proc_type_offsets(arch.getNumberOfProcessorTypes(), 0); - const auto& arch_proc_type_counts = arch.getProcessorTypeCount(); + const auto &arch_proc_type_counts = arch.getProcessorTypeCount(); for (unsigned type_idx = 1; type_idx < arch.getNumberOfProcessorTypes(); ++type_idx) { arch_proc_type_offsets[type_idx] = arch_proc_type_offsets[type_idx - 1] + arch_proc_type_counts[type_idx - 1]; } @@ -115,12 +115,12 @@ class TrimmedGroupScheduler : public Scheduler { } // Create the sub-architecture for one sub-problem. 
- BspArchitecture sub_arch(arch); - sub_arch.set_processors_consequ_types(sub_proc_counts, mem_weights); + BspArchitecture sub_arch(arch); + sub_arch.SetProcessorsConsequTypes(sub_proc_counts, mem_weights); // Calculate offsets for processor types within the 'sub_arch' std::vector sub_arch_proc_type_offsets(sub_arch.getNumberOfProcessorTypes(), 0); - const auto& sub_arch_proc_type_counts = sub_arch.getProcessorTypeCount(); + const auto &sub_arch_proc_type_counts = sub_arch.getProcessorTypeCount(); for (unsigned type_idx = 1; type_idx < sub_arch.getNumberOfProcessorTypes(); ++type_idx) { sub_arch_proc_type_offsets[type_idx] = sub_arch_proc_type_offsets[type_idx - 1] + sub_arch_proc_type_counts[type_idx - 1]; } @@ -135,8 +135,8 @@ class TrimmedGroupScheduler : public Scheduler { std::sort(group_vertices.begin(), group_vertices.end()); BspInstance sub_instanc; - sub_instanc.setArchitecture(sub_arch); // Set the sub-architecture - sub_instanc.setNodeProcessorCompatibility(instance.getNodeProcessorCompatibilityMatrix()); // Inherit compatibility + sub_instanc.setArchitecture(sub_arch); // Set the sub-architecture + sub_instanc.setNodeProcessorCompatibility(instance.getNodeProcessorCompatibilityMatrix()); // Inherit compatibility auto global_to_local_map = create_induced_subgraph_map(dag, sub_instanc.getComputationalDag(), group_vertices); // Create induced subgraph // Create a schedule object for the sub-problem @@ -144,10 +144,11 @@ class TrimmedGroupScheduler : public Scheduler { // Call the sub-scheduler to compute the schedule for this group of components auto status = sub_scheduler->computeSchedule(sub_schedule); - if (status != RETURN_STATUS::OSP_SUCCESS && status != RETURN_STATUS::BEST_FOUND) return status; + if (status != RETURN_STATUS::OSP_SUCCESS && status != RETURN_STATUS::BEST_FOUND) + return status; // Map the sub-schedule back to the main schedule. 
- for (const auto& v_global : group_vertices) { + for (const auto &v_global : group_vertices) { const auto v_local = global_to_local_map.at(v_global); const unsigned sub_proc = sub_schedule.assignedProcessor(v_local); const unsigned sub_superstep = sub_schedule.assignedSuperstep(v_local); diff --git a/include/osp/graph_algorithms/computational_dag_construction_util.hpp b/include/osp/graph_algorithms/computational_dag_construction_util.hpp index e85217e9..553996a6 100644 --- a/include/osp/graph_algorithms/computational_dag_construction_util.hpp +++ b/include/osp/graph_algorithms/computational_dag_construction_util.hpp @@ -34,7 +34,7 @@ namespace osp { * @tparam Graph_to The type of the target graph. Must satisfy `is_constructable_cdag_vertex`. * @param from The source graph. * @param to The target graph. - */ + */ template void constructComputationalDag(const Graph_from &from, Graph_to &to) { static_assert(is_computational_dag_v, "Graph_from must satisfy the computational_dag concept"); @@ -46,21 +46,21 @@ void constructComputationalDag(const Graph_from &from, Graph_to &to) { for (const auto &v_idx : from.vertices()) { if constexpr (has_typed_vertices_v and has_typed_vertices_v) { vertex_map.push_back(to.add_vertex(from.vertex_work_weight(v_idx), from.vertex_comm_weight(v_idx), - from.vertex_mem_weight(v_idx), from.vertex_type(v_idx))); + from.vertex_mem_weight(v_idx), from.vertex_type(v_idx))); } else { vertex_map.push_back(to.add_vertex(from.vertex_work_weight(v_idx), from.vertex_comm_weight(v_idx), - from.vertex_mem_weight(v_idx))); + from.vertex_mem_weight(v_idx))); } } if constexpr (has_edge_weights_v and has_edge_weights_v) { for (const auto &e : edges(from)) { - to.add_edge(vertex_map.at(source(e, from)), vertex_map.at(target(e, from)), from.edge_comm_weight(e)); + to.add_edge(vertex_map[source(e, from)], vertex_map[target(e, from)], from.edge_comm_weight(e)); } } else { for (const auto &v : from.vertices()) { for (const auto &child : from.children(v)) { - 
to.add_edge(vertex_map.at(v), vertex_map.at(child)); + to.add_edge(vertex_map[v], vertex_map[child]); } } } diff --git a/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp b/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp index 0b67ab30..616aea6b 100644 --- a/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp +++ b/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp @@ -17,6 +17,8 @@ limitations under the License. */ #pragma once +#include // for std::size_t + namespace osp { /** @@ -71,17 +73,17 @@ struct cdag_vertex_impl { }; /** - * @brief A vertex implementation with integer weights. Indexed by size_t. Node types are unsigned. + * @brief A vertex implementation with integer weights. Indexed by std::size_t. Node types are unsigned. * * This struct implements a vertex with integer weights for work, communication, and memory. */ -using cdag_vertex_impl_int = cdag_vertex_impl; +using cdag_vertex_impl_int = cdag_vertex_impl; /** - * @brief A vertex implementation with unsigned weights. Indexed by size_t. Node types are unsigned. + * @brief A vertex implementation with unsigned weights. Indexed by std::size_t. Node types are unsigned. * * This struct implements a vertex with unsigned weights for work, communication, and memory. */ -using cdag_vertex_impl_unsigned = cdag_vertex_impl; +using cdag_vertex_impl_unsigned = cdag_vertex_impl; } // namespace osp \ No newline at end of file diff --git a/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp b/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp index 74340de6..efe1996e 100644 --- a/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp +++ b/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp @@ -32,8 +32,8 @@ namespace osp { * @brief A vector-based implementation of a computational DAG. 
* * This class implements a computational DAG using adjacency lists stored in two std::vectors. - * It manages the storage of vertices and edges, and provides an interface to query and modify the graph. - * + * It manages the storage of vertices and edges, and provides an interface to query and modify the graph. + * * This class satisfies the following concepts: * - `is_computational_dag_typed_vertices` * - `is_directed_graph` @@ -80,7 +80,7 @@ class computational_dag_vector_impl { num_vertex_types_(0) { for (vertex_idx i = 0; i < num_vertices; ++i) { - vertices_.at(i).id = i; + vertices_[i].id = i; } } @@ -150,40 +150,40 @@ class computational_dag_vector_impl { [[nodiscard]] vertex_idx num_edges() const { return num_edges_; } /** - * @brief Returns the parents (in-neighbors) of a vertex. + * @brief Returns the parents (in-neighbors) of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] const std::vector &parents(const vertex_idx v) const { return in_neigbors.at(v); } + [[nodiscard]] const std::vector &parents(const vertex_idx v) const { return in_neigbors[v]; } /** - * @brief Returns the children (out-neighbors) of a vertex. + * @brief Returns the children (out-neighbors) of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] const std::vector &children(const vertex_idx v) const { return out_neigbors.at(v); } + [[nodiscard]] const std::vector &children(const vertex_idx v) const { return out_neigbors[v]; } /** - * @brief Returns the in-degree of a vertex. + * @brief Returns the in-degree of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast(in_neigbors.at(v).size()); } + [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast(in_neigbors[v].size()); } /** - * @brief Returns the out-degree of a vertex. + * @brief Returns the out-degree of a vertex. 
Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast(out_neigbors.at(v).size()); } + [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast(out_neigbors[v].size()); } - [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_.at(v).work_weight; } + [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_[v].work_weight; } - [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_.at(v).comm_weight; } + [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_[v].comm_weight; } - [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_.at(v).mem_weight; } + [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_[v].mem_weight; } - [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_.at(v).vertex_type; } + [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_[v].vertex_type; } [[nodiscard]] vertex_type_type num_vertex_types() const { return num_vertex_types_; } - [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_.at(v); } + [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_[v]; } /** * @brief Adds a new isolated vertex to the graph. 
@@ -240,7 +240,7 @@ class computational_dag_vector_impl { return false; } - out_neigbors.at(source).push_back(target); + out_neigbors[source].push_back(target); in_neigbors.at(target).push_back(source); num_edges_++; @@ -267,7 +267,6 @@ using computational_dag_vector_impl_def_t = computational_dag_vector_impl; - static_assert(is_directed_graph_edge_desc_v>, "computational_dag_vector_impl must satisfy the directed_graph_edge_desc concept"); diff --git a/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp b/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp index 1deadcee..3ab94872 100644 --- a/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp +++ b/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp @@ -80,8 +80,8 @@ class dag_vector_adapter { dag_vector_adapter(const std::vector> &out_neigbors_, const std::vector> &in_neigbors_) : vertices_(out_neigbors_.size()), out_neigbors(&out_neigbors_), in_neigbors(&in_neigbors_), num_edges_(0), num_vertex_types_(1) { for (vertex_idx i = 0; i < static_cast(out_neigbors_.size()); ++i) { - vertices_.at(i).id = i; - num_edges_ += out_neigbors_.at(i).size(); + vertices_[i].id = i; + num_edges_ += out_neigbors_[i].size(); } } @@ -107,8 +107,8 @@ class dag_vector_adapter { num_edges_ = 0; for (vertex_idx i = 0; i < static_cast(out_neigbors->size()); ++i) { - vertices_.at(i).id = i; - num_edges_ += out_neigbors->at(i).size(); + vertices_[i].id = i; + num_edges_ += out_neigbors_[i].size(); } num_vertex_types_ = 1; @@ -130,40 +130,40 @@ class dag_vector_adapter { [[nodiscard]] vertex_idx num_edges() const { return static_cast(num_edges_); } /** - * @brief Returns a view of the parents (in-neighbors) of a vertex. + * @brief Returns a view of the parents (in-neighbors) of a vertex. Does not perform bounds checking. * @param v The vertex index. 
*/ - [[nodiscard]] auto parents(const vertex_idx v) const { return vector_cast_view(in_neigbors->at(v)); } + [[nodiscard]] auto parents(const vertex_idx v) const { return vector_cast_view((*in_neigbors)[v]); } /** - * @brief Returns a view of the children (out-neighbors) of a vertex. + * @brief Returns a view of the children (out-neighbors) of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] auto children(const vertex_idx v) const { return vector_cast_view(out_neigbors->at(v)); } + [[nodiscard]] auto children(const vertex_idx v) const { return vector_cast_view((*out_neigbors)[v]); } /** - * @brief Returns the in-degree of a vertex. + * @brief Returns the in-degree of a vertex. Does not perform bounds checking. * @param v The vertex index. */ - [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast(in_neigbors->at(v).size()); } + [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast((*in_neigbors)[v].size()); } /** - * @brief Returns the out-degree of a vertex. + * @brief Returns the out-degree of a vertex. Does not perform bounds checking. * @param v The vertex index. 
*/ - [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast(out_neigbors->at(v).size()); } + [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast((*out_neigbors)[v].size()); } - [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_.at(v).work_weight; } + [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_[v].work_weight; } - [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_.at(v).comm_weight; } + [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_[v].comm_weight; } - [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_.at(v).mem_weight; } + [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_[v].mem_weight; } - [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_.at(v).vertex_type; } + [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_[v].vertex_type; } [[nodiscard]] vertex_type_type num_vertex_types() const { return num_vertex_types_; } - [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_.at(v); } + [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_[v]; } void set_vertex_work_weight(const vertex_idx v, const vertex_work_weight_type work_weight) { vertices_.at(v).work_weight = work_weight; @@ -192,7 +192,6 @@ class dag_vector_adapter { unsigned num_vertex_types_ = 0; }; - static_assert(is_directed_graph_edge_desc_v>, "dag_vector_adapter must satisfy the directed_graph_edge_desc concept"); diff --git a/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp b/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp index e8fbe586..b42ea17d 100644 --- 
a/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp +++ b/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp @@ -141,7 +141,7 @@ class vector_cast_view { * @param i The index of the element to access. * @return The element at index i, cast to to_t. */ - [[nodiscard]] auto operator[](std::size_t i) const { return static_cast(vec.at(i)); } + [[nodiscard]] auto operator[](std::size_t i) const { return static_cast(vec[i]); } }; } // namespace osp \ No newline at end of file diff --git a/tests/bsp_architecture.cpp b/tests/bsp_architecture.cpp index af26e034..d803bb56 100644 --- a/tests/bsp_architecture.cpp +++ b/tests/bsp_architecture.cpp @@ -19,8 +19,8 @@ limitations under the License. #define BOOST_TEST_MODULE Bsp_Architecture #include -#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" #include "osp/bsp/model/BspArchitecture.hpp" +#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" using namespace osp; @@ -61,18 +61,18 @@ BOOST_AUTO_TEST_CASE(ParameterizedConstructorTest) { BOOST_CHECK_EQUAL(architecture.maxMemoryBoundProcType(0), 100); - BOOST_TEST(architecture.sendCostMatrix() == uniform_sent_costs); + BOOST_TEST(architecture.sendCost() == uniform_sent_costs); std::vector> expectedSendCosts = {{0, 2, 2, 2}, {2, 0, 2, 2}, {2, 2, 0, 2}, {2, 2, 2, 0}}; - architecture.setSendCosts(expectedSendCosts); - BOOST_TEST(architecture.sendCostMatrix() == expectedSendCosts); + architecture.SetSendCosts(expectedSendCosts); + BOOST_TEST(architecture.sendCost() == expectedSendCosts); BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 1), 4); BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 0), 0); architecture.SetUniformSendCost(); - BOOST_TEST(architecture.sendCostMatrix() == uniform_sent_costs); + BOOST_TEST(architecture.sendCost() == uniform_sent_costs); BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 1), 2); BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 0), 
0); @@ -141,8 +141,7 @@ BOOST_AUTO_TEST_CASE(Architecture) { } // constructor - std::vector> send_costs = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, - {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; + std::vector> send_costs = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; BOOST_CHECK_THROW(BspArchitecture test31(7, 42942, 0, send_costs), std::invalid_argument); @@ -169,10 +168,8 @@ BOOST_AUTO_TEST_CASE(Architecture) { } // constructor - std::vector> send_costs2 = {{0, 1, 2, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, - {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; - std::vector> send_costs3 = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, - {3, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; + std::vector> send_costs2 = {{0, 1, 2, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; + std::vector> send_costs3 = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {3, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}}; BspArchitecture test4(6, 0, 4294965, send_costs2); BOOST_CHECK_EQUAL(test4.numberOfProcessors(), 6); diff --git a/tests/bsp_instance.cpp b/tests/bsp_instance.cpp index c2b0b02a..60e95999 100644 --- a/tests/bsp_instance.cpp +++ b/tests/bsp_instance.cpp @@ -19,12 +19,12 @@ limitations under the License. 
#define BOOST_TEST_MODULE Bsp_Architecture #include +#include "osp/auxiliary/io/arch_file_reader.hpp" +#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/bsp/model/BspInstance.hpp" #include "osp/bsp/model/BspSchedule.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" -#include "osp/auxiliary/io/arch_file_reader.hpp" -#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include #include @@ -84,8 +84,7 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { BOOST_CHECK_EQUAL(instance.isCompatible(0, 0), true); BOOST_CHECK_EQUAL(instance.isCompatible(1, 0), false); - - compatible_processor_range range(instance); + CompatibleProcessorRange range(instance); BOOST_CHECK_EQUAL(range.compatible_processors_type(0).size(), 3); BOOST_CHECK_EQUAL(range.compatible_processors_type(1).size(), 1); @@ -97,7 +96,6 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { } std::cout << std::endl; - std::cout << "Compatible processors type 1: " << std::endl; for (const auto &p : range.compatible_processors_type(1)) { @@ -105,7 +103,6 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) { } std::cout << std::endl; - BOOST_CHECK_EQUAL(range.compatible_processors_vertex(0).size(), 1); BOOST_CHECK_EQUAL(range.compatible_processors_vertex(1).size(), 3); BOOST_CHECK_EQUAL(range.compatible_processors_vertex(2).size(), 3); diff --git a/tests/debug_merkle_divider.cpp b/tests/debug_merkle_divider.cpp index bf3bd1b5..5763d840 100644 --- a/tests/debug_merkle_divider.cpp +++ b/tests/debug_merkle_divider.cpp @@ -16,24 +16,23 @@ limitations under the License. @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. 
Steiner */ -#include -#include "osp/auxiliary/io/dot_graph_file_reader.hpp" #include "osp/auxiliary/io/DotFileWriter.hpp" +#include "osp/auxiliary/io/dot_graph_file_reader.hpp" #include "osp/bsp/scheduler/GreedySchedulers/BspLocking.hpp" -#include "osp/bsp/scheduler/Serial.hpp" -#include "osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GreedyChildren.hpp" +#include "osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp" +#include "osp/bsp/scheduler/Serial.hpp" #include "osp/coarser/coarser_util.hpp" #include "osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp" +#include using namespace osp; - template -void check_partition_type_homogeneity(const GraphT& dag, const std::vector>& partition) { +void check_partition_type_homogeneity(const GraphT &dag, const std::vector> &partition) { // Group partitions by their ID std::map, std::vector>> partitions; for (vertex_idx_t i = 0; i < dag.num_vertices(); ++i) { @@ -41,19 +40,20 @@ void check_partition_type_homogeneity(const GraphT& dag, const std::vector" << std::endl; return 1; @@ -76,15 +76,12 @@ int main(int argc, char* argv[]) { instance.getComputationalDag().set_vertex_comm_weight(v, static_cast>(instance.getComputationalDag().vertex_comm_weight(v) * 0.01)); } - // Set up architecture - instance.getArchitecture().set_processors_consequ_types({24,48},{100,100}); + instance.getArchitecture().SetProcessorsConsequTypes({24, 48}, {100, 100}); instance.setDiagonalCompatibilityMatrix(2); instance.setSynchronisationCosts(2000); instance.setCommunicationCosts(1); - - // Set up the scheduler GrowLocalAutoCores growlocal; BspLocking locking; @@ -95,9 +92,9 @@ int main(int argc, char* argv[]) { ComboScheduler 
growlocal_kl(growlocal, kl); ComboScheduler locking_kl(locking, kl); ComboScheduler children_kl(children, kl); - + GreedyMetaScheduler scheduler; - //scheduler.addScheduler(growlocal_kl); + // scheduler.addScheduler(growlocal_kl); scheduler.addScheduler(locking_kl); scheduler.addScheduler(children_kl); scheduler.addSerialScheduler(); @@ -120,7 +117,7 @@ int main(int argc, char* argv[]) { graph_t corase_graph; coarser_util::construct_coarse_dag(instance.getComputationalDag(), corase_graph, partition); bool acyc = is_acyclic(corase_graph); - std::cout << "Partition is " << (acyc ? "acyclic." : "not acyclic."); + std::cout << "Partition is " << (acyc ? "acyclic." : "not acyclic."); std::cout << "Partition computation finished." << std::endl; std::cout << "Generated " << std::set>(partition.begin(), partition.end()).size() << " partitions." << std::endl; diff --git a/tests/kl_bsp_improver_test.cpp b/tests/kl_bsp_improver_test.cpp index df3ac3f1..6e1611ec 100644 --- a/tests/kl_bsp_improver_test.cpp +++ b/tests/kl_bsp_improver_test.cpp @@ -152,7 +152,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { // std::vector> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { diff --git a/tests/kl_lambda.cpp b/tests/kl_lambda.cpp index a7f40cf4..31f86130 100644 --- a/tests/kl_lambda.cpp +++ b/tests/kl_lambda.cpp @@ -25,14 +25,14 @@ limitations under the License. 
#include "osp/bsp/scheduler/LocalSearch/KernighanLin/kl_total_comm.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin/kl_total_cut.hpp" +#include "osp/auxiliary/io/arch_file_reader.hpp" +#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp" -#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" -#include "osp/auxiliary/io/arch_file_reader.hpp" -#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" -#include "test_graphs.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "test_graphs.hpp" using namespace osp; @@ -56,37 +56,38 @@ void add_node_types(Graph_t &dag) { for (const auto &v : dag.vertices()) { dag.set_vertex_type(v, node_type++ % 2); - } + } } template -void check_equal_affinity_table(table_t & table_1, table_t & table_2, const std::set & nodes) { +void check_equal_affinity_table(table_t &table_1, table_t &table_2, const std::set &nodes) { - for ( auto i : nodes) { + for (auto i : nodes) { BOOST_CHECK_EQUAL(table_1[i].size(), table_2[i].size()); - if (table_1[i].size() != table_2[i].size()) continue; + if (table_1[i].size() != table_2[i].size()) + continue; for (size_t j = 0; j < table_1[i].size(); ++j) { BOOST_CHECK_EQUAL(table_1[i][j].size(), table_2[i][j].size()); - if (table_1[i][j].size() != table_2[i][j].size()) continue; + if (table_1[i][j].size() != table_2[i][j].size()) + continue; for (size_t k = 0; k < table_1[i][j].size(); ++k) { BOOST_CHECK(std::abs(table_1[i][j][k] - table_2[i][j][k]) < 0.000001); - if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) { - std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", 
table_2=" << table_2[i][j][k] << std::endl; - + if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) { + std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl; } } } } } -void check_equal_lambda_map(const std::vector> & map_1, const std::vector> & map_2) { +void check_equal_lambda_map(const std::vector> &map_1, const std::vector> &map_2) { BOOST_CHECK_EQUAL(map_1.size(), map_2.size()); if (map_1.size() != map_2.size()) return; for (size_t i = 0; i < map_1.size(); ++i) { - for (const auto & [key, value] : map_1[i]) { + for (const auto &[key, value] : map_1[i]) { BOOST_CHECK_EQUAL(value, map_2[i].at(key)); if (value != map_2[i].at(key)) { @@ -117,7 +118,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) { BspInstance instance; bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); instance.getArchitecture().setSynchronisationCosts(5); instance.getArchitecture().setCommunicationCosts(5); @@ -134,7 +135,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) { add_mem_weights(instance.getComputationalDag()); add_node_types(instance.getComputationalDag()); - instance.getArchitecture().setProcessorsWithTypes({0,0,1,1}); + instance.getArchitecture().setProcessorsWithTypes({0, 0, 1, 1}); instance.setDiagonalCompatibilityMatrix(2); @@ -147,18 +148,15 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) { BOOST_CHECK(schedule.satisfiesNodeTypeConstraints()); kl_total_lambda_comm_improver kl; - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); BOOST_CHECK(schedule.satisfiesNodeTypeConstraints()); - } } - - BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { std::vector 
filenames_graph = test_graphs(); @@ -180,7 +178,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { BspInstance instance; bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); instance.getArchitecture().setSynchronisationCosts(5); instance.getArchitecture().setCommunicationCosts(5); @@ -204,7 +202,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); kl_total_lambda_comm_improver kl; - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -252,11 +250,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); // schedule.updateNumberOfSupersteps(); - -// using cost_f = kl_hyper_total_comm_cost_function; + +// using cost_f = kl_hyper_total_comm_cost_function; // using kl_improver_test = kl_improver_test; // kl_improver_test kl; - + // kl.setup_schedule(schedule); // auto &kl_active_schedule = kl.get_active_schedule(); @@ -269,7 +267,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + // BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); // BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); @@ -369,7 +367,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // using graph = computational_dag_edge_idx_vector_impl_def_int_t; // using VertexType = graph::vertex_idx; // using kl_move = kl_move_struct; - // graph dag; @@ -401,11 +398,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // schedule.setAssignedSupersteps({0, 0, 
1, 1, 2, 2, 3, 3}); // schedule.updateNumberOfSupersteps(); - -// using cost_f = kl_hyper_total_comm_cost_function; + +// using cost_f = kl_hyper_total_comm_cost_function; // using kl_improver_test = kl_improver_test; // kl_improver_test kl; - + // kl.setup_schedule(schedule); // auto &kl_active_schedule = kl.get_active_schedule(); @@ -418,7 +415,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) { // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); // BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + // auto node_selection = kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7}); // std::set nodes_to_check = {0, 1, 2, 3, 4, 5, 6, 7}; @@ -533,11 +530,10 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_penalty_test) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_hyper_total_comm_cost_function; + using cost_f = kl_hyper_total_comm_cost_function; using kl_improver_test = kl_improver_test; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -550,48 +546,47 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_penalty_test) { BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); - auto node_selection = kl.insert_gain_heap_test_penalty({2,3}); + auto node_selection = kl.insert_gain_heap_test_penalty({2, 3}); auto recompute_max_gain = kl.run_inner_iteration_test(); // best move 3 - std::cout << "------------------------recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << 
"------------------------recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); // best move 0 - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); // best move 1 - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; - + BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - } BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { @@ -629,27 +624,27 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); schedule.updateNumberOfSupersteps(); - - using cost_f = kl_hyper_total_comm_cost_function; + + using cost_f = kl_hyper_total_comm_cost_function; using kl_improver_test = kl_improver_test; 
kl_improver_test kl; - + kl.setup_schedule(schedule); BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - auto node_selection = kl.insert_gain_heap_test_penalty({7}); + auto node_selection = kl.insert_gain_heap_test_penalty({7}); auto recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "-----------recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "-----------recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - - auto& lambda_map = kl.get_comm_cost_f().node_lambda_map; + + auto &lambda_map = kl.get_comm_cost_f().node_lambda_map; BOOST_CHECK(lambda_map.get_proc_entry(v1, 0) == 2); BOOST_CHECK(lambda_map.get_proc_entry(v1, 1) == 1); @@ -669,32 +664,31 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { BOOST_CHECK(lambda_map.has_no_proc_entry(v8, 0)); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = 
kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - } // BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) { @@ -708,7 +702,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // std::cout << cwd << std::endl; // } - // for (auto &filename_graph : filenames_graph) { // GreedyBspScheduler test_scheduler; // BspInstance instance; @@ -724,7 +717,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // {4,4,0,1}, // {4,4,1,0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { @@ -752,7 +745,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // auto status = kl.improveSchedule(schedule); // auto finish_time = std::chrono::high_resolution_clock::now(); // auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - + // std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalLambdaCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl; // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -763,18 +756,17 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // // start_time = std::chrono::high_resolution_clock::now(); // // status = kl_old.improve_schedule_test_2(schedule_2); // // finish_time = std::chrono::high_resolution_clock::now(); - + // // duration = std::chrono::duration_cast(finish_time - start_time).count(); // // std::cout << "kl old finished in " << duration << " seconds, costs: " << 
schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl; - + // // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); // // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); // } // } - // BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs_mt) { // std::vector filenames_graph = large_spaa_graphs(); // using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -786,7 +778,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // std::cout << cwd << std::endl; // } - // for (auto &filename_graph : filenames_graph) { // GreedyBspScheduler test_scheduler; // BspInstance instance; @@ -802,7 +793,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // {4,4,0,1}, // {4,4,1,0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { @@ -830,7 +821,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // auto status = kl.improveSchedule(schedule); // auto finish_time = std::chrono::high_resolution_clock::now(); // auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - + // std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalLambdaCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl; // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -841,11 +832,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) { // // start_time = std::chrono::high_resolution_clock::now(); // // status = kl_old.improve_schedule_test_2(schedule_2); // // finish_time = std::chrono::high_resolution_clock::now(); - + // // duration = std::chrono::duration_cast(finish_time - start_time).count(); // // std::cout << "kl old finished in " << duration << " seconds, 
costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl; - + // // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); // // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); diff --git a/tests/kl_total.cpp b/tests/kl_total.cpp index 5d3d1486..58421144 100644 --- a/tests/kl_total.cpp +++ b/tests/kl_total.cpp @@ -22,18 +22,17 @@ limitations under the License. #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp" +#include "osp/auxiliary/io/arch_file_reader.hpp" +#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp" +#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp" #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp" -#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp" -#include "osp/auxiliary/io/arch_file_reader.hpp" -#include "osp/auxiliary/io/hdag_graph_file_reader.hpp" -#include "test_graphs.hpp" #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp" +#include "test_graphs.hpp" using namespace osp; - template void add_mem_weights(Graph_t &dag) { @@ -49,17 +48,16 @@ void add_mem_weights(Graph_t &dag) { } template -void check_equal_affinity_table(table_t & table_1, table_t & table_2, const std::set & nodes) { +void check_equal_affinity_table(table_t &table_1, table_t &table_2, const std::set &nodes) { BOOST_CHECK_EQUAL(table_1.size(), table_2.size()); - for ( auto i : nodes) { + for (auto i : nodes) { for (size_t j = 0; j < table_1[i].size(); ++j) { for (size_t k = 0; k < table_1[i][j].size(); ++k) { BOOST_CHECK(std::abs(table_1[i][j][k] - table_2[i][j][k]) < 0.000001); - if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) { - std::cout << "Mismatch at [" << 
i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl; - + if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) { + std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl; } } } @@ -102,16 +100,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_smoke_test) { schedule.updateNumberOfSupersteps(); - using kl_improver_t = kl_total_comm_improver; kl_improver_t kl; - - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true); - } BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) { @@ -135,7 +130,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) { BspInstance instance; bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(), - instance.getComputationalDag()); + instance.getComputationalDag()); instance.getArchitecture().setSynchronisationCosts(5); instance.getArchitecture().setCommunicationCosts(5); @@ -147,7 +142,6 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) { BOOST_CHECK(false); } - add_mem_weights(instance.getComputationalDag()); BspSchedule schedule(instance); @@ -158,7 +152,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) { BOOST_CHECK(schedule.satisfiesPrecedenceConstraints()); kl_total_comm_improver kl; - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -170,7 +164,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_superstep_removal_test) { using graph = computational_dag_edge_idx_vector_impl_def_int_t; using VertexType = graph::vertex_idx; - + graph dag; const VertexType v1 = dag.add_vertex(2, 9, 2); @@ -200,13 +194,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_superstep_removal_test) { // Create a schedule with an almost empty superstep (step 1) 
schedule.setAssignedProcessors({0, 0, 0, 0, 1, 1, 1, 1}); schedule.setAssignedSupersteps({0, 0, 0, 0, 1, 2, 2, 2}); - + schedule.updateNumberOfSupersteps(); unsigned original_steps = schedule.numberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + + using cost_f = kl_total_comm_cost_function; kl_improver kl; - + auto status = kl.improveSchedule(schedule); BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -250,11 +244,10 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; using kl_improver_test = kl_improver_test; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -267,13 +260,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); - auto node_selection = kl.insert_gain_heap_test_penalty({2,3}); + auto node_selection = kl.insert_gain_heap_test_penalty({2, 3}); - auto& affinity = kl.get_affinity_table(); + auto &affinity = kl.get_affinity_table(); BOOST_CHECK_CLOSE(affinity[v3][0][0], 5.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v3][0][1], 4.0, 0.00001); @@ -290,41 +283,40 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) { BOOST_CHECK_CLOSE(affinity[v4][1][2], -3.5, 0.00001); auto recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "------------------------recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "------------------------recompute max_gain: { "; + for (const auto &[key, value] : 
recompute_max_gain) { std::cout << key << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - + recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - } BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_penalty_test) { @@ -363,55 +355,53 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_penalty_test) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; using kl_improver_test = kl_improver_test; kl_improver_test kl; - + kl.setup_schedule(schedule); - //auto &kl_active_schedule = kl.get_active_schedule(); + // auto &kl_active_schedule = kl.get_active_schedule(); BOOST_CHECK_CLOSE(51.5, 
kl.get_current_cost(), 0.00001); - auto node_selection = kl.insert_gain_heap_test_penalty({7}); + auto node_selection = kl.insert_gain_heap_test_penalty({7}); auto recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "-----------recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "-----------recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } - std::cout << "}" << std::endl; + } + std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - + recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); recompute_max_gain = kl.run_inner_iteration_test(); - std::cout << "recompute max_gain: { "; - for (const auto & [key, value] : recompute_max_gain) { + std::cout << "recompute max_gain: { "; + for (const auto &[key, value] : recompute_max_gain) { std::cout << key << " "; - } + } std::cout << "}" << std::endl; BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001); - } BOOST_AUTO_TEST_CASE(kl_improver_violation_handling_test) { @@ -450,16 +440,15 @@ 
BOOST_AUTO_TEST_CASE(kl_improver_violation_handling_test) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; kl_improver_test kl; - + kl.setup_schedule(schedule); kl.compute_violations_test(); BOOST_CHECK_EQUAL(kl.is_feasible(), false); - + kl_improver kl_improver; kl_improver.improveSchedule(schedule); @@ -502,10 +491,9 @@ BOOST_AUTO_TEST_CASE(kl_base_1) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -529,11 +517,11 @@ BOOST_AUTO_TEST_CASE(kl_base_1) { BOOST_CHECK_EQUAL(kl.is_feasible(), false); BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001); - kl_move move_2(v2, 3.0 + 4.5 - 4.0 , 0, 0, 1, 0); + kl_move move_2(v2, 3.0 + 4.5 - 4.0, 0, 0, 1, 0); kl.apply_move_test(move_2); - BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(0), 39.0); // 42-3 + BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(0), 39.0); // 42-3 BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(0), 5.0); // 2+3 BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 1); BOOST_CHECK_EQUAL(kl.is_feasible(), false); @@ -541,7 +529,7 @@ BOOST_AUTO_TEST_CASE(kl_base_1) { kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7}); - auto& affinity = kl.get_affinity_table(); + auto &affinity = kl.get_affinity_table(); BOOST_CHECK_CLOSE(affinity[v1][0][1], 2.0 - 4.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][1][1], 0.0, 0.00001); @@ -598,10 +586,9 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -614,7 
+601,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001); BOOST_CHECK_EQUAL(kl.is_feasible(), true); @@ -636,7 +623,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { BOOST_CHECK_EQUAL(kl.is_feasible(), true); BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001); - kl_move move_2(v2, -1.0 - 8.5 , 1, 1, 0, 0); + kl_move move_2(v2, -1.0 - 8.5, 1, 1, 0, 0); kl.apply_move_test(move_2); @@ -652,7 +639,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { BOOST_CHECK_EQUAL(kl.is_feasible(), false); BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001); - kl_move move_x(v2, -2.0 + 8.5 , 0, 0, 1, 0); + kl_move move_x(v2, -2.0 + 8.5, 0, 0, 1, 0); kl.apply_move_test(move_x); @@ -670,14 +657,13 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7}); - auto& affinity = kl.get_affinity_table(); + auto &affinity = kl.get_affinity_table(); BOOST_CHECK_CLOSE(affinity[v1][0][1], -4.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][0][2], -2.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][1][1], 2.0, 0.00001); - BOOST_CHECK_CLOSE(affinity[v1][1][2], 0.0, 0.00001); - + BOOST_CHECK_CLOSE(affinity[v1][1][2], 0.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v2][0][1], 9.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v2][0][2], 11.5, 0.00001); @@ -719,7 +705,6 @@ BOOST_AUTO_TEST_CASE(kl_base_2) { BOOST_CHECK_CLOSE(affinity[v7][1][0], 7.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v7][1][1], 8.0, 0.00001); - BOOST_CHECK_CLOSE(affinity[v8][0][0], 8.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v8][0][1], 8.5, 0.00001); @@ -763,10 +748,9 @@ 
BOOST_AUTO_TEST_CASE(kl_base_3) { schedule.updateNumberOfSupersteps(); - - using cost_f = kl_total_comm_cost_function; + using cost_f = kl_total_comm_cost_function; kl_improver_test kl; - + kl.setup_schedule(schedule); auto &kl_active_schedule = kl.get_active_schedule(); @@ -779,21 +763,19 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0); BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0); - + BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4); BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true); kl.insert_gain_heap_test_penalty({0, 1, 2, 3, 4, 5, 6, 7}); - auto& affinity = kl.get_affinity_table(); - + auto &affinity = kl.get_affinity_table(); BOOST_CHECK_CLOSE(affinity[v1][0][1], 1.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][0][2], 3.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v1][1][1], 2.0, 0.00001); - BOOST_CHECK_CLOSE(affinity[v1][1][2], 16.5, 0.00001); - + BOOST_CHECK_CLOSE(affinity[v1][1][2], 16.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v2][0][1], 15, 0.00001); BOOST_CHECK_CLOSE(affinity[v2][0][2], 11.5, 0.00001); @@ -835,16 +817,13 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { BOOST_CHECK_CLOSE(affinity[v7][1][0], 7.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v7][1][1], 8.0, 0.00001); - BOOST_CHECK_CLOSE(affinity[v8][0][0], 14.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v8][0][1], 8.5, 0.00001); BOOST_CHECK_CLOSE(affinity[v8][1][0], 8.0, 0.00001); BOOST_CHECK_CLOSE(affinity[v8][1][1], 1.0, 0.00001); - } - // BOOST_AUTO_TEST_CASE(kl_improver_incremental_update_test) { // using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -881,12 +860,11 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3}); // schedule.updateNumberOfSupersteps(); - - -// using cost_f = kl_total_comm_cost_function; + +// using cost_f = 
kl_total_comm_cost_function; // using kl_improver_test = kl_improver_test; // kl_improver_test kl; - + // kl.setup_schedule(schedule); // auto node_selection = kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7}); @@ -974,7 +952,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // }; - // BOOST_AUTO_TEST_CASE(kl_total_comm_large_test_graphs) { // std::vector filenames_graph = large_spaa_graphs(); // using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -987,7 +964,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // std::cout << cwd << std::endl; // } - // for (auto &filename_graph : filenames_graph) { // GreedyBspScheduler test_scheduler; // BspInstance instance; @@ -1003,7 +979,7 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // {4,4,0,1}, // {4,4,1,0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { @@ -1031,9 +1007,9 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // auto start_time = std::chrono::high_resolution_clock::now(); // auto status = kl.improveSchedule(schedule); // auto finish_time = std::chrono::high_resolution_clock::now(); - + // auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - + // std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl; // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -1044,18 +1020,17 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // // start_time = std::chrono::high_resolution_clock::now(); // // status = kl_old.improve_schedule_test_2(schedule_2); // // finish_time = std::chrono::high_resolution_clock::now(); - + // // duration = std::chrono::duration_cast(finish_time - start_time).count(); // // std::cout << "kl old finished in " << duration << " seconds, costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< 
std::endl; - + // // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); // // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true); // } // } - // BOOST_AUTO_TEST_CASE(kl_total_comm_large_test_graphs_mt) { // std::vector filenames_graph = large_spaa_graphs(); // using graph = computational_dag_edge_idx_vector_impl_def_int_t; @@ -1068,7 +1043,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // std::cout << cwd << std::endl; // } - // for (auto &filename_graph : filenames_graph) { // GreedyBspScheduler test_scheduler; // BspInstance instance; @@ -1084,7 +1058,7 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // {4,4,0,1}, // {4,4,1,0}}; -// instance.getArchitecture().setSendCosts(send_cost); +// instance.getArchitecture().SetSendCosts(send_cost); // if (!status_graph) { @@ -1112,9 +1086,9 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // auto start_time = std::chrono::high_resolution_clock::now(); // auto status = kl.improveSchedule(schedule); // auto finish_time = std::chrono::high_resolution_clock::now(); - + // auto duration = std::chrono::duration_cast(finish_time - start_time).count(); - + // std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl; // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND); @@ -1125,11 +1099,11 @@ BOOST_AUTO_TEST_CASE(kl_base_3) { // // start_time = std::chrono::high_resolution_clock::now(); // // status = kl_old.improve_schedule_test_2(schedule_2); // // finish_time = std::chrono::high_resolution_clock::now(); - + // // duration = std::chrono::duration_cast(finish_time - start_time).count(); // // std::cout << "kl old finished in " << duration << " seconds, costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl; - + // // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS 
|| status == RETURN_STATUS::BEST_FOUND); // // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true);