diff --git a/include/osp/auxiliary/io/arch_file_reader.hpp b/include/osp/auxiliary/io/arch_file_reader.hpp
index 4e100ba8..71b0f006 100644
--- a/include/osp/auxiliary/io/arch_file_reader.hpp
+++ b/include/osp/auxiliary/io/arch_file_reader.hpp
@@ -18,10 +18,10 @@ limitations under the License.
 
 #pragma once
 
+#include "osp/bsp/model/BspArchitecture.hpp"
 #include <fstream>
 #include <iostream>
 #include <sstream>
-#include "osp/bsp/model/BspArchitecture.hpp"
 
 namespace osp { namespace file_reader {
 
@@ -31,7 +31,8 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture<Graph_t> &archit
 
     // Skip comment lines
     while (std::getline(infile, line)) {
-        if (!line.empty() && line[0] != '%') break;
+        if (!line.empty() && line[0] != '%')
+            break;
     }
 
     // Parse architecture parameters
@@ -58,24 +59,24 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture<Graph_t> &archit
     if (0 <= mem_type && mem_type <= 3) {
         using memw_t = v_memw_t<Graph_t>;
         switch (mem_type) {
-            case 0:
-                architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::NONE);
-                break;
-            case 1:
-                architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::LOCAL);
-                architecture.setMemoryBound(static_cast<memw_t>(M));
-                break;
-            case 2:
-                architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::GLOBAL);
-                architecture.setMemoryBound(static_cast<memw_t>(M));
-                break;
-            case 3:
-                architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT);
-                architecture.setMemoryBound(static_cast<memw_t>(M));
-                break;
-            default:
-                std::cerr << "Invalid memory type.\n";
-                return false;
+        case 0:
+            architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::NONE);
+            break;
+        case 1:
+            architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::LOCAL);
+            architecture.setMemoryBound(static_cast<memw_t>(M));
+            break;
+        case 2:
+            architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::GLOBAL);
+            architecture.setMemoryBound(static_cast<memw_t>(M));
+            break;
+        case 3:
+            architecture.setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT);
+            architecture.setMemoryBound(static_cast<memw_t>(M));
+            break;
+        default:
+            std::cerr << "Invalid memory type.\n";
+            return false;
         }
     } else if (mem_type == -1) {
         std::cout << "No memory type specified. Assuming \"NONE\".\n";
@@ -116,7 +117,7 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture<Graph_t> &archit
             return false;
         }
 
-        architecture.setSendCosts(fromProc, toProc, static_cast<v_commw_t<Graph_t>>(value));
+        architecture.SetSendCosts(fromProc, toProc, static_cast<v_commw_t<Graph_t>>(value));
     }
 
     // Ensure there are no remaining non-comment lines
@@ -127,7 +128,6 @@ bool readBspArchitecture(std::ifstream &infile, BspArchitecture<Graph_t> &archit
         }
     }
 
-    architecture.computeCommAverage();
     return true;
 }
 
diff --git a/include/osp/bsp/model/BspArchitecture.hpp b/include/osp/bsp/model/BspArchitecture.hpp
index 8ac1c0a8..74872aae 100644
--- a/include/osp/bsp/model/BspArchitecture.hpp
+++ b/include/osp/bsp/model/BspArchitecture.hpp
@@ -34,46 +34,51 @@ limitations under the License.
 
 namespace osp {
 
-static constexpr unsigned CacheLineSize = 64;
-
+/**
+ * @enum MEMORY_CONSTRAINT_TYPE
+ * @brief Enumerates the different types of memory constraints.
+ * Memory bounds are set per processor and apply to aggregated memory weights of nodes according to the different types of memory constraints.
+ */
 enum class MEMORY_CONSTRAINT_TYPE {
-    NONE,
-    LOCAL,
-    GLOBAL,
-    PERSISTENT_AND_TRANSIENT,
-    LOCAL_IN_OUT,
-    LOCAL_INC_EDGES,
-    LOCAL_SOURCES_INC_EDGES
+    NONE,                     /**< No memory constraints. */
+    LOCAL,                    /**< The memory bounds apply to the sum of memory weights of nodes assigned to the same processor and superstep. */
+    GLOBAL,                   /**< The memory bounds apply to the sum of memory weights of the nodes assigned to the same processor. */
+    PERSISTENT_AND_TRANSIENT, /**< Memory bounds apply to the sum of memory weights of nodes assigned to the same processor plus the maximum communication weight of a node assigned to a processor. */
+    LOCAL_IN_OUT,             /**< Memory constraints are local in-out. Experimental. */
+    LOCAL_INC_EDGES,          /**< Memory constraints are local incident edges. Experimental. */
+    LOCAL_SOURCES_INC_EDGES   /**< Memory constraints are local source incident edges. Experimental. */
 };
 
-inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) {
+/**
+ * @brief Converts the enum to a string literal.
+ * Returns const char* to avoid std::string allocation overhead.
+ */
+inline const char *to_string(MEMORY_CONSTRAINT_TYPE type) {
     switch (type) {
     case MEMORY_CONSTRAINT_TYPE::NONE:
-        os << "NONE";
-        break;
+        return "NONE";
     case MEMORY_CONSTRAINT_TYPE::LOCAL:
-        os << "LOCAL";
-        break;
+        return "LOCAL";
     case MEMORY_CONSTRAINT_TYPE::GLOBAL:
-        os << "GLOBAL";
-        break;
+        return "GLOBAL";
     case MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT:
-        os << "PERSISTENT_AND_TRANSIENT";
-        break;
+        return "PERSISTENT_AND_TRANSIENT";
     case MEMORY_CONSTRAINT_TYPE::LOCAL_IN_OUT:
-        os << "LOCAL_IN_OUT";
-        break;
+        return "LOCAL_IN_OUT";
     case MEMORY_CONSTRAINT_TYPE::LOCAL_INC_EDGES:
-        os << "LOCAL_INC_EDGES";
-        break;
+        return "LOCAL_INC_EDGES";
     case MEMORY_CONSTRAINT_TYPE::LOCAL_SOURCES_INC_EDGES:
-        os << "LOCAL_SOURCES_INC_EDGES";
-        break;
+        return "LOCAL_SOURCES_INC_EDGES";
     default:
-        os << "UNKNOWN";
-        break;
+        return "UNKNOWN";
     }
-    return os;
+}
+
+/**
+ * @brief Stream operator overload using the helper function.
+ */
+inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) {
+    return os << to_string(type);
 }
 
 /**
@@ -81,8 +86,28 @@ inline std::ostream &operator<<(std::ostream &os, MEMORY_CONSTRAINT_TYPE type) {
  * @brief Represents the architecture of a BSP (Bulk Synchronous Parallel) system.
  *
  * The BspArchitecture class stores information about the number of processors, communication costs,
- * synchronization costs, and send costs between processors in a BSP system. It provides methods to
- * set and retrieve these values.
+ * synchronization costs, the send costs between processors, the types of processors, and the memory
+ * bounds. It provides methods to set and retrieve these values.
+ *
+ * **Processors:**
+ * The architecture consists of p processors, indexed from 0 to p-1.
+ *
+ * **Processor Types:**
+ * Processors can have different types, which are represented by non-negative integers.
+ * Processor types are assumed to be consecutive integers starting from 0.
+ *
+ * **Communication and Synchronization Costs:**
+ * - Communication Cost (g): The cost of communicating a unit of data between processors, i.e., the inverse bandwidth.
+ * - Synchronization Cost (L): The cost of synchronizing all processors at the end of a superstep.
+ *
+ * **Send Costs (NUMA):**
+ * The architecture supports Non-Uniform Memory Access (NUMA) effects via a send cost matrix.
+ * The cost to send data from processor i to processor j is given by g * sendCosts[i][j].
+ * By default, send costs are uniform (1 for distinct processors, 0 for self).
+ *
+ * **Memory Constraints:**
+ * Each processor has a memory bound. The `MEMORY_CONSTRAINT_TYPE` determines how these bounds are applied
+ * (e.g., local per superstep, global per processor).
  */
 template<typename Graph_t>
 class BspArchitecture {
@@ -90,84 +115,128 @@ class BspArchitecture {
     static_assert(is_computational_dag_v<Graph_t>, "BspSchedule can only be used with computational DAGs.");
 
   private:
-    unsigned number_processors;
-    unsigned number_of_processor_types;
+    /** @brief The number of processors in the architecture. Must be at least 1. */
+    unsigned numberOfProcessors_;
+
+    /** @brief The number of processor types in the architecture. See processorTypes_ for more details. */
+    unsigned numberOfProcessorTypes_;
+
+    /** @brief The communication costs, typically denoted 'g' for the BSP model. */
+    v_commw_t<Graph_t> communicationCosts_;
 
-    v_commw_t<Graph_t> communication_costs;
-    v_commw_t<Graph_t> synchronisation_costs;
+    /** @brief The synchronisation costs, typically denoted 'L' for the BSP model. */
+    v_commw_t<Graph_t> synchronisationCosts_;
 
-    std::vector<v_memw_t<Graph_t>> memory_bound;
+    /** @brief The memory bounds, one entry per processor. */
+    std::vector<v_memw_t<Graph_t>> memoryBound_;
 
-    bool isNuma;
+    /** @brief Flag to indicate whether the architecture is NUMA, i.e., whether the send costs are different for different pairs of processors. */
+    bool isNuma_;
 
-    std::vector<unsigned> processor_type;
+    /** @brief The type of each processor. Processor types are used to express compatibilities with node types, which can be specified in the BspInstance. */
+    std::vector<unsigned> processorTypes_;
 
-    std::vector<std::vector<v_commw_t<Graph_t>>> send_costs;
+    /** @brief A flattened p x p matrix of send costs. Access via index [i * numberOfProcessors_ + j]. */
+    std::vector<v_commw_t<Graph_t>> sendCosts_;
 
-    MEMORY_CONSTRAINT_TYPE memory_const_type = MEMORY_CONSTRAINT_TYPE::NONE;
+    /** @brief The memory constraint type. */
+    MEMORY_CONSTRAINT_TYPE memoryConstraintType_ = MEMORY_CONSTRAINT_TYPE::NONE;
 
-    bool are_send_cost_numa() {
-        if (number_processors == 1)
+    /** @brief Helper function to calculate the index of a flattened p x p matrix. */
+    std::size_t FlatIndex(const unsigned row, const unsigned col) const {
+        return static_cast<std::size_t>(row) * numberOfProcessors_ + col;
+    }
+
+    bool AreSendCostsNuma() {
+        if (numberOfProcessors_ == 1U)
             return false;
 
-        v_commw_t<Graph_t> val = send_costs[0][1];
-        for (unsigned p1 = 0; p1 < number_processors; p1++) {
-            for (unsigned p2 = 0; p2 < number_processors; p2++) {
+        const v_commw_t<Graph_t> val = sendCosts_[1U];
+        for (unsigned p1 = 0U; p1 < numberOfProcessors_; p1++) {
+            for (unsigned p2 = 0U; p2 < numberOfProcessors_; p2++) {
                 if (p1 == p2)
                     continue;
-                if (send_costs[p1][p2] != val)
+                if (sendCosts_[FlatIndex(p1, p2)] != val)
                     return true;
             }
         }
         return false;
     }
 
+    void UpdateNumberOfProcessorTypes() {
+        numberOfProcessorTypes_ = 0U;
+        for (unsigned p = 0U; p < numberOfProcessors_; p++) {
+            if (processorTypes_[p] >= numberOfProcessorTypes_) {
+                numberOfProcessorTypes_ = processorTypes_[p] + 1U;
+            }
+        }
+    }
+
+    void SetSendCostDiagonalToZero() {
+        for (unsigned i = 0U; i < numberOfProcessors_; i++) {
+            sendCosts_[FlatIndex(i, i)] = 0U;
+        }
+    }
+
+    void InitializeUniformSendCosts() {
+        sendCosts_.assign(numberOfProcessors_ * numberOfProcessors_, 1U);
+        SetSendCostDiagonalToZero();
+        isNuma_ = false;
+    }
+
   public:
+    /**
+     * @brief Default constructor.
+     * Initializes a BSP architecture with 2 processors, 1 processor type,
+     * communication costs of 1, synchronisation costs of 2, memory bounds of 100,
+     * and send costs of 1 between distinct processors (0 from a processor to itself).
+     */
     BspArchitecture()
-        : number_processors(2), number_of_processor_types(1), communication_costs(1), synchronisation_costs(2),
-          memory_bound(std::vector<v_memw_t<Graph_t>>(number_processors, 100)), isNuma(false),
-          processor_type(std::vector<unsigned>(number_processors, 0)),
-          send_costs(std::vector<std::vector<v_commw_t<Graph_t>>>(
-              number_processors, std::vector<v_commw_t<Graph_t>>(number_processors, 1))) {
-        for (unsigned i = 0; i < number_processors; i++) {
-            send_costs[i][i] = 0;
-        }
+        : numberOfProcessors_(2U), numberOfProcessorTypes_(1U), communicationCosts_(1U), synchronisationCosts_(2U),
+          memoryBound_(numberOfProcessors_, 100U), isNuma_(false),
+          processorTypes_(numberOfProcessors_, 0U), sendCosts_(numberOfProcessors_ * numberOfProcessors_, 1U) {
+        SetSendCostDiagonalToZero();
     }
 
     BspArchitecture(const BspArchitecture &other) = default;
-    BspArchitecture(BspArchitecture &&other) = default;
+    BspArchitecture(BspArchitecture &&other) noexcept = default;
     BspArchitecture &operator=(const BspArchitecture &other) = default;
-    BspArchitecture &operator=(BspArchitecture &&other) = default;
-    ~BspArchitecture() = default;
+    BspArchitecture &operator=(BspArchitecture &&other) noexcept = default;
+    virtual ~BspArchitecture() = default;
 
     /**
      * @brief Constructs a BspArchitecture object with the specified number of processors, communication cost, and
      * synchronization cost.
      *
-     * @param processors The number of processors in the architecture.
-     * @param comm_cost The communication cost between processors.
-     * @param synch_cost The synchronization cost between processors.
-     */
-    BspArchitecture(unsigned processors, v_commw_t<Graph_t> comm_cost, v_commw_t<Graph_t> synch_cost,
-                    v_memw_t<Graph_t> memory_bound_ = 100)
-        : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost),
-          synchronisation_costs(synch_cost),
-          memory_bound(std::vector<v_memw_t<Graph_t>>(number_processors, memory_bound_)), isNuma(false),
-          processor_type(std::vector<unsigned>(number_processors, 0)),
-          send_costs(std::vector<std::vector<v_commw_t<Graph_t>>>(
-              number_processors, std::vector<v_commw_t<Graph_t>>(number_processors, 1))) {
-
-        for (unsigned i = 0; i < number_processors; i++) {
-            send_costs[i][i] = 0;
+     * @param NumberOfProcessors The number of processors in the architecture. Must be greater than 0.
+     * @param CommunicationCost The communication cost between processors.
+     * @param SynchronisationCost The synchronization cost between processors.
+     * @param MemoryBound The memory bound for each processor (default: 100).
+     */
+    BspArchitecture(const unsigned NumberOfProcessors, const v_commw_t<Graph_t> CommunicationCost, const v_commw_t<Graph_t> SynchronisationCost,
+                    const v_memw_t<Graph_t> MemoryBound = 100U)
+        : numberOfProcessors_(NumberOfProcessors), numberOfProcessorTypes_(1U), communicationCosts_(CommunicationCost),
+          synchronisationCosts_(SynchronisationCost),
+          memoryBound_(NumberOfProcessors, MemoryBound), isNuma_(false),
+          processorTypes_(NumberOfProcessors, 0U), sendCosts_(NumberOfProcessors * NumberOfProcessors, 1U) {
+        if (NumberOfProcessors == 0U) {
+            throw std::runtime_error("BspArchitecture: Number of processors must be greater than 0.");
         }
+        SetSendCostDiagonalToZero();
     }
 
+    /**
+     * @brief Copy constructor from a BspArchitecture with a different graph type.
+     *
+     * @tparam Graph_t_other The graph type of the other BspArchitecture.
+     * @param other The other BspArchitecture object.
+     */
     template<typename Graph_t_other>
     BspArchitecture(const BspArchitecture<Graph_t_other> &other)
-        : number_processors(other.numberOfProcessors()), number_of_processor_types(other.getNumberOfProcessorTypes()),
-          communication_costs(other.communicationCosts()), synchronisation_costs(other.synchronisationCosts()),
-          memory_bound(other.memoryBound()), isNuma(other.isNumaArchitecture()), processor_type(other.processorTypes()),
-          send_costs(other.sendCosts()) {
+        : numberOfProcessors_(other.numberOfProcessors()), numberOfProcessorTypes_(other.getNumberOfProcessorTypes()),
+          communicationCosts_(other.communicationCosts()), synchronisationCosts_(other.synchronisationCosts()),
+          memoryBound_(other.memoryBound()), isNuma_(other.isNumaArchitecture()), processorTypes_(other.processorTypes()),
+          sendCosts_(other.sendCostsVector()) {
 
         static_assert(std::is_same_v<v_memw_t<Graph_t>, v_memw_t<Graph_t_other>>,
                       "BspArchitecture: Graph_t and Graph_t_other have the same memory weight type.");
@@ -180,81 +249,87 @@ class BspArchitecture {
     }
 
     /**
-     * @brief Constructs a BspArchitecture object with the specified number of processors, communication cost, and
-     * synchronization cost.
+     * @brief Constructs a BspArchitecture object with custom send costs.
      *
-     * @param processors The number of processors in the architecture.
-     * @param comm_cost The communication cost between processors.
-     * @param synch_cost The synchronization cost between processors.
-     */
-    BspArchitecture(unsigned int processors, v_commw_t<Graph_t> comm_cost, v_commw_t<Graph_t> synch_cost,
-                    std::vector<std::vector<v_commw_t<Graph_t>>> send_costs_)
-        : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost),
-          synchronisation_costs(synch_cost), memory_bound(std::vector<v_memw_t<Graph_t>>(number_processors, 100)),
-          processor_type(std::vector<unsigned>(number_processors, 0)), send_costs(send_costs_) {
-
-        if (number_processors != send_costs.size()) {
-            throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n");
+     * @param NumberOfProcessors The number of processors. Must be greater than 0.
+     * @param CommunicationCost The communication cost.
+     * @param SynchronisationCost The synchronization cost.
+     * @param SendCosts The matrix of send costs between processors. Needs to be a processors x processors matrix. Diagonal entries are forced to zero.
+     */
+    BspArchitecture(const unsigned NumberOfProcessors, const v_commw_t<Graph_t> CommunicationCost, const v_commw_t<Graph_t> SynchronisationCost,
+                    const std::vector<std::vector<v_commw_t<Graph_t>>> &SendCosts)
+        : numberOfProcessors_(NumberOfProcessors), numberOfProcessorTypes_(1U), communicationCosts_(CommunicationCost),
+          synchronisationCosts_(SynchronisationCost), memoryBound_(NumberOfProcessors, 100U),
+          processorTypes_(NumberOfProcessors, 0U) {
+        if (NumberOfProcessors == 0U) {
+            throw std::runtime_error("BspArchitecture: Number of processors must be greater than 0.");
+        }
+        if (NumberOfProcessors != SendCosts.size()) {
+            throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n");
         }
-        if (std::any_of(send_costs.begin(), send_costs.end(),
-                        [processors](const auto &thing) { return thing.size() != processors; })) {
-            throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n");
+        if (std::any_of(SendCosts.begin(), SendCosts.end(),
+                        [NumberOfProcessors](const auto &thing) { return thing.size() != NumberOfProcessors; })) {
+            throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n");
         }
 
-        for (unsigned i = 0; i < number_processors; i++) {
-            send_costs[i][i] = 0;
+        sendCosts_.reserve(NumberOfProcessors * NumberOfProcessors);
+        for (const auto &row : SendCosts) {
+            sendCosts_.insert(sendCosts_.end(), row.begin(), row.end());
         }
 
-        isNuma = are_send_cost_numa();
+        SetSendCostDiagonalToZero();
+        isNuma_ = AreSendCostsNuma();
     }
 
     /**
-     * @brief Constructs a BspArchitecture object with the specified number of processors, communication cost, and
-     * synchronization cost.
+     * @brief Constructs a BspArchitecture object with custom send costs and memory bound.
      *
-     * @param processors The number of processors in the architecture.
-     * @param comm_cost The communication cost between processors.
-     * @param synch_cost The synchronization cost between processors.
-     */
-    BspArchitecture(unsigned int processors, v_commw_t<Graph_t> comm_cost, v_commw_t<Graph_t> synch_cost,
-                    v_memw_t<Graph_t> memory_bound_, std::vector<std::vector<v_commw_t<Graph_t>>> send_costs_)
-        : number_processors(processors), number_of_processor_types(1), communication_costs(comm_cost),
-          synchronisation_costs(synch_cost),
-          memory_bound(std::vector<v_memw_t<Graph_t>>(number_processors, memory_bound_)),
-          processor_type(std::vector<unsigned>(number_processors, 0)), send_costs(send_costs_) {
-
-        if (number_processors != send_costs.size()) {
-            throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n");
+     * @param NumberOfProcessors The number of processors. Must be greater than 0.
+     * @param CommunicationCost The communication cost.
+     * @param SynchronisationCost The synchronization cost.
+     * @param MemoryBound The memory bound for each processor.
+     * @param SendCosts The matrix of send costs between processors. Needs to be a processors x processors matrix. Diagonal entries are forced to zero.
+     */
+    BspArchitecture(const unsigned NumberOfProcessors, const v_commw_t<Graph_t> CommunicationCost, const v_commw_t<Graph_t> SynchronisationCost,
+                    const v_memw_t<Graph_t> MemoryBound, const std::vector<std::vector<v_commw_t<Graph_t>>> &SendCosts)
+        : numberOfProcessors_(NumberOfProcessors), numberOfProcessorTypes_(1U), communicationCosts_(CommunicationCost),
+          synchronisationCosts_(SynchronisationCost), memoryBound_(NumberOfProcessors, MemoryBound),
+          processorTypes_(NumberOfProcessors, 0U) {
+        if (NumberOfProcessors == 0U) {
+            throw std::runtime_error("BspArchitecture: Number of processors must be greater than 0.");
+        }
+        if (NumberOfProcessors != SendCosts.size()) {
+            throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n");
         }
-        if (std::any_of(send_costs.begin(), send_costs.end(),
-                        [processors](const auto &thing) { return thing.size() != processors; })) {
-            throw std::invalid_argument("send_costs_ needs to be a processors x processors matrix.\n");
+        if (std::any_of(SendCosts.begin(), SendCosts.end(),
+                        [NumberOfProcessors](const auto &thing) { return thing.size() != NumberOfProcessors; })) {
+            throw std::invalid_argument("sendCosts_ needs to be a processors x processors matrix.\n");
         }
 
-        for (unsigned i = 0u; i < number_processors; i++) {
-            send_costs[i][i] = 0u;
+        sendCosts_.reserve(NumberOfProcessors * NumberOfProcessors);
+        for (const auto &row : SendCosts) {
+            sendCosts_.insert(sendCosts_.end(), row.begin(), row.end());
         }
 
-        isNuma = are_send_cost_numa();
+        SetSendCostDiagonalToZero();
+        isNuma_ = AreSendCostsNuma();
     }
 
     /**
-     * Sets the uniform send cost for each pair of processors in the BSP architecture.
+     * @brief Sets the uniform send cost for each pair of processors.
      * The send cost is set to 0 if the processors are the same, and 1 otherwise.
-     * This function assumes that the number of processors has already been set.
      */
     void SetUniformSendCost() {
-
-        for (unsigned i = 0; i < number_processors; i++) {
-            for (unsigned j = 0; j < number_processors; j++) {
+        for (unsigned i = 0U; i < numberOfProcessors_; i++) {
+            for (unsigned j = 0U; j < numberOfProcessors_; j++) {
                 if (i == j) {
-                    send_costs[i][j] = 0;
+                    sendCosts_[FlatIndex(i, j)] = 0U;
                 } else {
-                    send_costs[i][j] = 1;
+                    sendCosts_[FlatIndex(i, j)] = 1U;
                 }
             }
         }
-        isNuma = false;
+        isNuma_ = false;
     }
 
     /**
@@ -265,80 +340,59 @@ class BspArchitecture {
      *
      * @param base The base value used to calculate the send cost.
      */
-    void SetExpSendCost(v_commw_t<Graph_t> base) {
-
-        isNuma = true;
+    void SetExpSendCost(const v_commw_t<Graph_t> base) {
+        isNuma_ = true;
 
         unsigned maxPos = 1;
         constexpr unsigned two = 2;
-        for (; intpow(two, maxPos + 1) <= number_processors - 1; ++maxPos) {
+        for (; intpow(two, maxPos + 1) <= numberOfProcessors_ - 1; ++maxPos) {
         }
-        for (unsigned i = 0; i < number_processors; ++i)
-            for (unsigned j = i + 1; j < number_processors; ++j)
-                for (unsigned pos = maxPos; pos <= maxPos; --pos)
-                    if (((1 << pos) & i) != ((1 << pos) & j)) {
-                        send_costs[i][j] = send_costs[j][i] = intpow(base, pos);
+
+        for (unsigned i = 0U; i < numberOfProcessors_; ++i) {
+            for (unsigned j = i + 1U; j < numberOfProcessors_; ++j) {
+                // Signed loop counter so the loop stops after pos == 0 instead of relying on unsigned wrap-around
+                for (int pos = static_cast<int>(maxPos); pos >= 0; --pos) {
+                    if (((1U << pos) & i) != ((1U << pos) & j)) {
+                        sendCosts_[FlatIndex(i, j)] = sendCosts_[FlatIndex(j, i)] = intpow(base, static_cast<unsigned>(pos));
                         break;
                     }
+                }
+            }
+        }
     }
 
-    inline auto processors() const { return integral_range<unsigned>(number_processors); }    
-
     /**
-     * @brief Computes the average communication cost of the BspArchitecture.
-     *
-     * This function computes the average communication cost of the BspArchitecture object.
-     * The average communication cost is calculated as the sum of the send costs between processors divided by the
-     * number of processors.
-     *
-     * @return The average communication cost as an unsigned integer.
+     * @brief Returns a view of processor indices from 0 to numberOfProcessors_ - 1.
+     * @return An integral view of processor indices.
      */
-    v_commw_t<Graph_t> computeCommAverage() const {
-
-        double avg = 0;
-        for (unsigned i = 0; i < number_processors; ++i)
-            for (unsigned j = 0; j < number_processors; ++j)
-                avg += static_cast<double>(send_costs[i][j]);
-        avg = avg * static_cast<double>(communication_costs) / static_cast<double>(number_processors) / static_cast<double>(number_processors);
-
-        if (avg > static_cast<double>(std::numeric_limits<unsigned>::max())) {
-            throw std::invalid_argument("avg comm exceeds the limit (something is very wrong)");
-        }
-
-        return static_cast<v_commw_t<Graph_t>>(std::round(avg));
-    }
+    [[nodiscard]] auto processors() const { return integral_range<unsigned>(numberOfProcessors_); }
 
     /**
-     * Sets the send costs for the BspArchitecture.
+     * @brief Sets the send costs for the BspArchitecture.
      *
      * @param vec A 2D vector representing the send costs between processors.
-     *            The size of the vector must be equal to the number of processors.
-     *            Each inner vector must also have a size equal to the number of processors.
-     * @throws std::invalid_argument if the size of the vector or inner vectors is invalid.
+     * @throws std::invalid_argument if the size of the vector is invalid or diagonal elements are not 0.
      */
-    void setSendCosts(const std::vector<std::vector<v_commw_t<Graph_t>>> &vec) {
-
-        if (vec.size() != number_processors) {
-            throw std::invalid_argument("Invalid Argument");
+    void SetSendCosts(const std::vector<std::vector<v_commw_t<Graph_t>>> &vec) {
+        if (vec.size() != numberOfProcessors_) {
+            throw std::invalid_argument("Invalid Argument: Vector size mismatch.");
         }
 
-        isNuma = false;
-        for (unsigned i = 0; i < number_processors; i++) {
-
-            if (vec[i].size() != number_processors) {
-                throw std::invalid_argument("Invalid Argument");
+        isNuma_ = false;
+        for (unsigned i = 0U; i < numberOfProcessors_; i++) {
+            if (vec.at(i).size() != numberOfProcessors_) {
+                throw std::invalid_argument("Invalid Argument: Inner vector size mismatch.");
             }
 
-            for (unsigned j = 0; j < number_processors; j++) {
-
+            for (unsigned j = 0U; j < numberOfProcessors_; j++) {
                 if (i == j) {
-                    if (vec[i][j] != 0)
-                        throw std::invalid_argument("Invalid Argument, Diagonal elements should be 0");
+                    if (vec.at(i).at(j) != 0U)
+                        throw std::invalid_argument("Invalid Argument: Diagonal elements should be 0.");
                 } else {
-                    send_costs[i][j] = vec[i][j];
+                    sendCosts_.at(FlatIndex(i, j)) = vec.at(i).at(j);
 
-                    if (number_processors > 1 && vec[i][j] != vec[0][1]) {
-                        isNuma = true;
+                    if (numberOfProcessors_ > 1U && vec.at(i).at(j) != vec.at(0U).at(1U)) {
+                        isNuma_ = true;
                     }
                 }
             }
@@ -346,324 +400,309 @@ class BspArchitecture {
     }
 
     /**
-     * Sets the send costs between two processors.
+     * @brief Sets the send costs between two processors.
      *
-     * @param p1 The index of the first processor.
-     * @param p2 The index of the second processor.
+     * @param p1 The index of the first processor. Must be less than numberOfProcessors_.
+     * @param p2 The index of the second processor. Must be less than numberOfProcessors_.
      * @param cost The cost of sending data between the processors.
-     *
-     * @remarks If the two processors are the same, the send cost is not set.
-     *          If the cost is not equal to 1, the architecture is considered NUMA.
+     * @throws std::invalid_argument if the processor indices are out of bounds.
      */
-    void setSendCosts(unsigned p1, unsigned p2, v_commw_t<Graph_t> cost) {
-
-        if (p1 >= number_processors || p2 > number_processors)
-            throw std::invalid_argument("Invalid Argument");
+    void SetSendCosts(const unsigned p1, const unsigned p2, const v_commw_t<Graph_t> cost) {
+        if (p1 >= numberOfProcessors_ || p2 >= numberOfProcessors_) // Fixed condition: p2 >= number_processors
+            throw std::invalid_argument("Invalid Argument: Processor index out of bounds.");
 
         if (p1 != p2) {
-            send_costs[p1][p2] = cost;
-
-            isNuma = are_send_cost_numa();
+            sendCosts_.at(FlatIndex(p1, p2)) = cost;
+            isNuma_ = AreSendCostsNuma();
         }
     }
 
     /**
-     * Sets the memory bound for all processors of the BspArchitecture.
-     *
-     * @param memory_bound_ The new memory bound for all processors.
+     * @brief Sets the memory bound for all processors.
+     * @param MemoryBound The new memory bound for all processors.
      */
-    inline void setMemoryBound(v_memw_t<Graph_t> memory_bound_) {
-        memory_bound = std::vector<v_memw_t<Graph_t>>(number_processors, memory_bound_);
+    void setMemoryBound(const v_memw_t<Graph_t> MemoryBound) {
+        memoryBound_.assign(numberOfProcessors_, MemoryBound);
     }
 
-    inline void setMemoryBound(const std::vector<v_memw_t<Graph_t>> &memory_bound_) { memory_bound = memory_bound_; }
-
-    inline void setMemoryBound(v_memw_t<Graph_t> memory_bound_, unsigned proc) {
-
-        if (proc >= number_processors) {
-            throw std::invalid_argument("Invalid Argument setMemoryBound");
+    /**
+     * @brief Sets the memory bound for all processors using a vector.
+     * @param MemoryBound The vector of memory bounds.
+     * @throws std::invalid_argument if the size of the vector is invalid.
+     */
+    void setMemoryBound(const std::vector<v_memw_t<Graph_t>> &MemoryBound) {
+        if (MemoryBound.size() != numberOfProcessors_) {
+            throw std::invalid_argument("Invalid Argument: Memory bound vector size does not match number of processors.");
         }
+        memoryBound_ = MemoryBound;
+    }
 
-        memory_bound[proc] = memory_bound_;
+    /**
+     * @brief Sets the memory bound for a specific processor.
+     * @param MemoryBound The new memory bound for the processor.
+     * @param processorIndex The processor index. Must be less than numberOfProcessors_.
+     */
+    void setMemoryBound(const v_memw_t<Graph_t> MemoryBound, const unsigned processorIndex) {
+        memoryBound_.at(processorIndex) = MemoryBound;
     }
 
     /**
-     * @brief Sets the synchronization costs for the BspArchitecture.
-     *
-     * This function sets the synchronization costs for the BspArchitecture object.
-     * The synchronization costs represent the costs of establishing communication between processors.
-     *
-     * @param synch_cost The synchronization costs to be set.
+     * @brief Sets the synchronization costs.
+     * @param SynchCost The new synchronization costs.
      */
-    inline void setSynchronisationCosts(v_commw_t<Graph_t> synch_cost) { synchronisation_costs = synch_cost; }
+    void setSynchronisationCosts(const v_commw_t<Graph_t> SynchCost) { synchronisationCosts_ = SynchCost; }
 
     /**
-     * @brief Sets the communication costs for the BspArchitecture.
-     *
-     * This function sets the communication costs for the BspArchitecture object.
-     * The communication costs represent the costs of sending messages between processors.
-     *
-     * @param comm_cost The communication costs to be set.
+     * @brief Sets the communication costs.
+     * @param CommCost The new communication costs.
      */
-    inline void setCommunicationCosts(v_commw_t<Graph_t> comm_cost) { communication_costs = comm_cost; }
+    void setCommunicationCosts(const v_commw_t<Graph_t> CommCost) { communicationCosts_ = CommCost; }
 
     /**
-     * @brief Sets the number of processors in the BSP architecture.
-     *
-     * This function sets the number of processors in the BSP architecture and sets the send costs between processors
-     * to 1. The send_costs matrix represents the costs of sending messages between processors. The diagonal elements of
-     * the matrix are set to 0, indicating that there is no cost to send a message from a processor to itself.
-     *
-     * @param num_proc The number of processors in the BSP architecture.
+     * @brief Checks if the architecture is NUMA.
+     * @return True if NUMA, false otherwise.
      */
-    void setNumberOfProcessors(unsigned num_proc) {
+    [[nodiscard]] bool isNumaArchitecture() const { return isNuma_; }
 
-        number_processors = num_proc;
-        number_of_processor_types = 1;
-        processor_type = std::vector<unsigned>(number_processors, 0);
-        send_costs = std::vector<std::vector<v_commw_t<Graph_t>>>(
-            number_processors, std::vector<v_commw_t<Graph_t>>(number_processors, 1));
-        for (unsigned i = 0; i < number_processors; i++) {
-            send_costs[i][i] = 0;
+    /**
+     * @brief Sets the number of processors. Processor type is set to 0 for all processors.
+     * Resets send costs to uniform (1) and diagonal to 0. The memory bound is set to 100 for all processors.
+     * @param numberOfProcessors The number of processors. Must be greater than 0.
+     * @throws std::invalid_argument if the number of processors is 0.
+     */
+    void setNumberOfProcessors(const unsigned numberOfProcessors) {
+        if (numberOfProcessors == 0) {
+            throw std::invalid_argument("Invalid Argument: Number of processors must be greater than 0.");
         }
-        memory_bound.resize(num_proc, memory_bound.back());
+        numberOfProcessors_ = numberOfProcessors;
+        numberOfProcessorTypes_ = 1U;
+        processorTypes_.assign(numberOfProcessors_, 0U);
+
+        InitializeUniformSendCosts();
 
-        isNuma = false;
+        // initialize memory bound to 100 for all processors
+        memoryBound_.assign(numberOfProcessors_, 100U);
     }
 
     /**
-     * @brief Sets the number of processors and their types in the BSP architecture.
-     *
-     * This function sets the number of processors in the BSP architecture and sets the send costs between processors
-     * to 1. The send_costs matrix represents the costs of sending messages between processors. The diagonal elements of
-     * the matrix are set to 0, indicating that there is no cost to send a message from a processor to itself.
-     *
-     * @param processor_types_ The type of the respective processors.
+     * @brief Sets the number of processors and their types. Number of processors is set to the size of the processor types vector.
+     * Resets send costs to uniform (1). Resets memory bound to 100 for all processors.
+     * @param processorTypes The types of the respective processors. Must not be empty; an empty vector results in std::invalid_argument.
      */
-    void setProcessorsWithTypes(const std::vector<v_type_t<Graph_t>> &processor_types_) {
-
-        if (processor_types_.size() > std::numeric_limits<unsigned>::max()) {
-            throw std::invalid_argument("Invalid Argument, number of processors exceeds the limit");
+    void setProcessorsWithTypes(const std::vector<v_type_t<Graph_t>> &processorTypes) {
+        if (processorTypes.empty()) {
+            throw std::invalid_argument("Invalid Argument: Processor types vector is empty.");
         }
-
-        number_processors = static_cast<unsigned>(processor_types_.size());
-
-        number_of_processor_types = 0;
-        processor_type = processor_types_;
-        send_costs = std::vector<std::vector<v_commw_t<Graph_t>>>(
-            number_processors, std::vector<v_commw_t<Graph_t>>(number_processors, 1));
-        for (unsigned i = 0; i < number_processors; i++) {
-            send_costs[i][i] = 0;
+        if (processorTypes.size() > std::numeric_limits<unsigned>::max()) {
+            throw std::invalid_argument("Invalid Argument: Number of processors exceeds the limit.");
         }
-        memory_bound.resize(number_processors, memory_bound.back());
+        numberOfProcessors_ = static_cast<unsigned>(processorTypes.size());
+        processorTypes_ = processorTypes;
+
+        InitializeUniformSendCosts();
 
-        isNuma = false;
-        updateNumberOfProcessorTypes();
+        // initialize memory bound to 100 for all processors
+        memoryBound_.assign(numberOfProcessors_, 100U);
+        UpdateNumberOfProcessorTypes();
     }
 
     /**
-     * Returns whether the architecture is NUMA.
-     *
-     * @return True if the architecture is NUMA, false otherwise.
+     * @brief Sets processors based on counts of consecutive types.
+     * The architecture will have processorTypeCount[0] processors of type 0, processorTypeCount[1] processors of type 1, etc.
+     * The memory bound for each processor of type i is set to processorTypeMemory[i].
+     * The send costs are set to uniform (1).
+     * @param processorTypeCount Vector where index is type and value is count of processors of that type.
+     * @param processorTypeMemory Vector where index is type and value is memory bound for that type.
      */
-    inline bool isNumaArchitecture() const { return isNuma; }
-
-    void set_processors_consequ_types(const std::vector<v_type_t<Graph_t>> &processor_type_count_,
-                                      const std::vector<v_memw_t<Graph_t>> &processor_type_memory_) {
-
-        if (processor_type_count_.size() != processor_type_memory_.size()) {
-            throw std::invalid_argument(
-                "Invalid Argument, processor_type_count_ and processor_type_memory_ must have the same size");
+    void SetProcessorsConsequTypes(const std::vector<v_type_t<Graph_t>> &processorTypeCount,
+                                   const std::vector<v_memw_t<Graph_t>> &processorTypeMemory) {
+        if (processorTypeCount.size() != processorTypeMemory.size()) {
+            throw std::invalid_argument("Invalid Argument: processorTypeCount and processorTypeMemory must have the same size.");
         }
 
-        if (processor_type_count_.size() > std::numeric_limits<unsigned>::max()) {
-            throw std::invalid_argument("Invalid Argument, number of processors exceeds the limit");
+        if (processorTypeCount.size() > std::numeric_limits<unsigned>::max()) {
+            throw std::invalid_argument("Invalid Argument: Number of processors exceeds the limit.");
         }
 
-        number_of_processor_types = static_cast<unsigned>(processor_type_count_.size());
-        number_processors = std::accumulate(processor_type_count_.begin(), processor_type_count_.end(), 0u);
+        numberOfProcessorTypes_ = static_cast<unsigned>(processorTypeCount.size());
+        numberOfProcessors_ = std::accumulate(processorTypeCount.begin(), processorTypeCount.end(), 0U);
 
-        processor_type = std::vector<v_type_t<Graph_t>>(number_processors, 0);
-        memory_bound = std::vector<v_memw_t<Graph_t>>(number_processors, 0);
+        // initialize processor types and memory bound
+        processorTypes_.assign(numberOfProcessors_, 0U);
+        memoryBound_.assign(numberOfProcessors_, 0U);
 
-        unsigned offset = 0;
-        for (unsigned i = 0; i < processor_type_count_.size(); i++) {
-
-            for (unsigned j = 0; j < processor_type_count_[i]; j++) {
-                processor_type[offset + j] = i;
-                memory_bound[offset + j] = processor_type_memory_[i];
+        unsigned offset = 0U;
+        for (unsigned i = 0U; i < processorTypeCount.size(); i++) {
+            for (unsigned j = 0U; j < processorTypeCount.at(i); j++) {
+                processorTypes_.at(offset + j) = i;
+                memoryBound_.at(offset + j) = processorTypeMemory.at(i);
             }
-            offset += processor_type_count_[i];
+            offset += processorTypeCount.at(i);
         }
 
-        send_costs = std::vector<std::vector<v_commw_t<Graph_t>>>(
-            number_processors, std::vector<v_commw_t<Graph_t>>(number_processors, 1));
-        for (unsigned i = 0; i < number_processors; i++) {
-            send_costs[i][i] = 0;
-        }
-        isNuma = false;
+        InitializeUniformSendCosts();
     }
 
     /**
-     * Returns the memory bound of the BspArchitecture.
-     *
-     * @return The memory bound as an unsigned integer.
+     * @brief Returns the memory bounds of all processors.
+     * @return Vector of memory bounds.
      */
-    inline const std::vector<v_memw_t<Graph_t>> &memoryBound() const { return memory_bound; }
+    [[nodiscard]] const std::vector<v_memw_t<Graph_t>> &memoryBound() const { return memoryBound_; }
 
-    inline v_memw_t<Graph_t> memoryBound(unsigned proc) const { return memory_bound[proc]; }
+    /**
+     * @brief Returns the memory bound of a specific processor.
+     * @param proc The processor index.
+     * @return The memory bound.
+     */
+    [[nodiscard]] v_memw_t<Graph_t> memoryBound(const unsigned proc) const { return memoryBound_[proc]; }
 
-    v_memw_t<Graph_t> minMemoryBound() const { return *(std::min_element(memory_bound.begin(), memory_bound.end())); }
-    v_memw_t<Graph_t> maxMemoryBound() const { return *(std::max_element(memory_bound.begin(), memory_bound.end())); }
-    v_memw_t<Graph_t> sumMemoryBound() const { return std::accumulate(memory_bound.begin(), memory_bound.end(), 0); }
+    /**
+     * @brief Returns the maximum memory bound over all processors.
+     * @return The maximum memory bound.
+     */
+    [[nodiscard]] v_memw_t<Graph_t> maxMemoryBound() const { return *(std::max_element(memoryBound_.begin(), memoryBound_.end())); }
 
-    v_memw_t<Graph_t> maxMemoryBoundProcType(v_type_t<Graph_t> procType) const {
-        v_memw_t<Graph_t> max_mem = 0;
-        for (unsigned proc = 0; proc < number_processors; proc++) {
-            if (processor_type[proc] == procType) {
-                max_mem = std::max(max_mem, memory_bound[proc]);
+    /**
+     * @brief Returns the maximum memory bound over all processors of a specific type.
+     *
+     * @param procType The processor type.
+     * @return The maximum memory bound.
+     */
+    [[nodiscard]] v_memw_t<Graph_t> maxMemoryBoundProcType(const v_type_t<Graph_t> procType) const {
+        v_memw_t<Graph_t> max_mem = 0U;
+        for (unsigned proc = 0U; proc < numberOfProcessors_; proc++) {
+            if (processorTypes_[proc] == procType) {
+                max_mem = std::max(max_mem, memoryBound_[proc]);
             }
         }
         return max_mem;
     }
 
     /**
-     * Returns the number of processors in the architecture.
-     *
+     * @brief Returns the number of processors.
      * @return The number of processors.
      */
-    inline unsigned numberOfProcessors() const { return number_processors; }
+    [[nodiscard]] unsigned numberOfProcessors() const { return numberOfProcessors_; }
 
     /**
-     * Returns the communication costs of the BSP architecture.
-     *
-     * @return The communication costs as an unsigned integer.
+     * @brief Returns the communication costs.
+     * @return The communication costs.
      */
-    inline v_commw_t<Graph_t> communicationCosts() const { return communication_costs; }
+    [[nodiscard]] v_commw_t<Graph_t> communicationCosts() const { return communicationCosts_; }
 
     /**
-     * Returns the synchronization costs of the BspArchitecture.
-     *
-     * @return The synchronization costs as an unsigned integer.
+     * @brief Returns the synchronization costs.
+     * @return The synchronization costs.
      */
-    inline v_commw_t<Graph_t> synchronisationCosts() const { return synchronisation_costs; }
+    [[nodiscard]] v_commw_t<Graph_t> synchronisationCosts() const { return synchronisationCosts_; }
 
     /**
-     * Returns a copy of the send costs matrix.
-     *
-     * @return A copy of the send costs matrix.
+     * @brief Returns the send costs matrix. Internally the matrix is stored as a flattened matrix. This function allocates, computes and returns the matrix on the fly.
+     * @return The send costs matrix.
      */
-    inline std::vector<std::vector<v_commw_t<Graph_t>>> sendCostMatrixCopy() const { return send_costs; }
+    [[nodiscard]] std::vector<std::vector<v_commw_t<Graph_t>>> sendCost() const {
+        std::vector<std::vector<v_commw_t<Graph_t>>> matrix(numberOfProcessors_, std::vector<v_commw_t<Graph_t>>(numberOfProcessors_));
+        for (unsigned i = 0; i < numberOfProcessors_; ++i) {
+            for (unsigned j = 0; j < numberOfProcessors_; ++j) {
+                matrix[i][j] = sendCosts_[FlatIndex(i, j)];
+            }
+        }
+        return matrix;
+    }
 
     /**
-     * Returns a reference to the send costs matrix.
-     *
-     * @return A reference to the send costs matrix.
+     * @brief Returns the flattened send costs vector.
+     * @return The send costs vector.
      */
-    inline const std::vector<std::vector<v_commw_t<Graph_t>>> &sendCostMatrix() const { return send_costs; }
+    [[nodiscard]] const std::vector<v_commw_t<Graph_t>> &sendCostsVector() const { return sendCosts_; }
 
-    // the type indeces of the processor (e.g. CPU, vector/tensor core)
-    inline const std::vector<unsigned> &processorTypes() const { return processor_type; }
+    /**
+     * @brief Returns the processor types.
+     * @return Vector of processor types.
+     */
+    [[nodiscard]] const std::vector<unsigned> &processorTypes() const { return processorTypes_; }
 
     /**
-     * Returns the communication costs between two processors. The communication costs are the send costs multiplied by
-     * the communication costs.
+     * @brief Returns the communication costs between two processors. Does not perform bounds checking.
+     * The communication costs are the send costs multiplied by the communication costs factor.
      *
      * @param p1 The index of the first processor.
      * @param p2 The index of the second processor.
-     *
-     * @return The send costs between the two processors.
+     * @return The communication costs between the two processors.
      */
-    inline v_commw_t<Graph_t> communicationCosts(unsigned p1, unsigned p2) const {
-        return communication_costs * send_costs[p1][p2];
+    [[nodiscard]] v_commw_t<Graph_t> communicationCosts(const unsigned p1, const unsigned p2) const {
+        return communicationCosts_ * sendCosts_[FlatIndex(p1, p2)];
     }
 
     /**
-     * Returns the send costs between two processors.
+     * @brief Returns the send costs between two processors. Does not perform bounds checking.
      *
      * @param p1 The index of the first processor.
      * @param p2 The index of the second processor.
-     *
      * @return The send costs between the two processors.
      */
-    inline v_commw_t<Graph_t> sendCosts(unsigned p1, unsigned p2) const { return send_costs[p1][p2]; }
-
-    inline auto sendCosts() const { return send_costs; }
-
-    // the type index of the processor (e.g. CPU, vector/tensor core)
-    inline v_type_t<Graph_t> processorType(unsigned p1) const { return processor_type[p1]; }
+    [[nodiscard]] v_commw_t<Graph_t> sendCosts(const unsigned p1, const unsigned p2) const { return sendCosts_[FlatIndex(p1, p2)]; }
 
-    void setProcessorType(unsigned p1, v_type_t<Graph_t> type) {
-
-        if (p1 >= number_processors)
-            throw std::invalid_argument("Invalid Argument");
+    /**
+     * @brief Returns the type of a specific processor. Does not perform bounds checking.
+     * @param p1 The processor index.
+     * @return The processor type.
+     */
+    [[nodiscard]] v_type_t<Graph_t> processorType(const unsigned p1) const { return processorTypes_[p1]; }
 
-        processor_type[p1] = type;
-        number_of_processor_types = std::max(number_of_processor_types, type + 1u);
+    /**
+     * @brief Sets the type of a specific processor. Performs bounds checking.
+     * @param p1 The processor index.
+     * @param type The new processor type.
+     */
+    void setProcessorType(const unsigned p1, const v_type_t<Graph_t> type) {
+        processorTypes_.at(p1) = type;
+        numberOfProcessorTypes_ = std::max(numberOfProcessorTypes_, type + 1U);
     }
 
-    std::vector<unsigned> getProcessorTypeCount() const {
-
-        std::vector<unsigned> type_count(number_of_processor_types, 0u);
-        for (unsigned p = 0u; p < number_processors; p++) {
-            type_count[processor_type[p]]++;
+    /**
+     * @brief Returns the count of processors for each type.
+     * @return Vector where index is type and value is count.
+     */
+    [[nodiscard]] std::vector<unsigned> getProcessorTypeCount() const {
+        std::vector<unsigned> type_count(numberOfProcessorTypes_, 0U);
+        for (unsigned p = 0U; p < numberOfProcessors_; p++) {
+            type_count[processorTypes_[p]]++;
         }
         return type_count;
     }
 
-    unsigned getMinProcessorTypeCount() const {
-        const auto &type_count = getProcessorTypeCount();
-        if (type_count.empty()) {
-            return 0;
-        }
-        return *std::min_element(type_count.begin(), type_count.end());
-    }
-
-    void print_architecture(std::ostream &os) const {
-
-        os << "Architectur info:  number of processors: " << number_processors
-           << ", Number of processor types: " << number_of_processor_types
-           << ", Communication costs: " << communication_costs << ", Synchronization costs: " << synchronisation_costs
-           << std::endl;
+    /**
+     * @brief Prints the architecture details to the output stream.
+     * @param os The output stream.
+     */
+    void print(std::ostream &os) const {
+        os << "Architecture info:  number of processors: " << numberOfProcessors_
+           << ", Number of processor types: " << numberOfProcessorTypes_
+           << ", Communication costs: " << communicationCosts_ << ", Synchronization costs: " << synchronisationCosts_
+           << "\n";
         os << std::setw(17) << " Processor: ";
-        for (unsigned i = 0; i < number_processors; i++) {
+        for (unsigned i = 0U; i < numberOfProcessors_; i++) {
             os << std::right << std::setw(5) << i << " ";
         }
-        os << std::endl;
+        os << "\n";
         os << std::setw(17) << "Processor type: ";
-        for (unsigned i = 0; i < number_processors; i++) {
-            os << std::right << std::setw(5) << processor_type[i] << " ";
+        for (unsigned i = 0U; i < numberOfProcessors_; i++) {
+            os << std::right << std::setw(5) << processorTypes_.at(i) << " ";
         }
-        os << std::endl;
+        os << "\n";
         os << std::setw(17) << "Memory bound: ";
-        for (unsigned i = 0; i < number_processors; i++) {
-            os << std::right << std::setw(5) << memory_bound[i] << " ";
-        }
-        os << std::endl;
-    }
-
-    void updateNumberOfProcessorTypes() {
-        number_of_processor_types = 0;
-        for (unsigned p = 0; p < number_processors; p++) {
-            if (processor_type[p] >= number_of_processor_types) {
-                number_of_processor_types = processor_type[p] + 1;
-            }
-        }
-    }
-
-    std::vector<std::vector<unsigned>> getProcessorIdsByType() const {
-        std::vector<std::vector<unsigned>> processor_ids_by_type(number_of_processor_types);
-        for (unsigned i = 0; i < numberOfProcessors(); ++i) {
-            processor_ids_by_type[processorType(i)].push_back(i);
+        for (unsigned i = 0U; i < numberOfProcessors_; i++) {
+            os << std::right << std::setw(5) << memoryBound_.at(i) << " ";
         }
-        return processor_ids_by_type;
+        os << "\n";
     }
 
-    inline unsigned getNumberOfProcessorTypes() const { return number_of_processor_types; };
+    [[nodiscard]] unsigned getNumberOfProcessorTypes() const { return numberOfProcessorTypes_; };
 
-    inline MEMORY_CONSTRAINT_TYPE getMemoryConstraintType() const { return memory_const_type; }
-    inline void setMemoryConstraintType(MEMORY_CONSTRAINT_TYPE memory_const_type_) {
-        memory_const_type = memory_const_type_;
+    [[nodiscard]] MEMORY_CONSTRAINT_TYPE getMemoryConstraintType() const { return memoryConstraintType_; }
+    void setMemoryConstraintType(const MEMORY_CONSTRAINT_TYPE memoryConstraintType) {
+        memoryConstraintType_ = memoryConstraintType;
     }
 };
 
diff --git a/include/osp/bsp/model/BspInstance.hpp b/include/osp/bsp/model/BspInstance.hpp
index 4e31d145..7ab72fd4 100644
--- a/include/osp/bsp/model/BspInstance.hpp
+++ b/include/osp/bsp/model/BspInstance.hpp
@@ -27,7 +27,10 @@ limitations under the License.
 
 namespace osp {
 
-enum class RETURN_STATUS { OSP_SUCCESS, BEST_FOUND, TIMEOUT, ERROR };
+enum class RETURN_STATUS { OSP_SUCCESS,
+                           BEST_FOUND,
+                           TIMEOUT,
+                           ERROR };
 
 inline std::string to_string(const RETURN_STATUS status) {
     switch (status) {
@@ -44,13 +47,23 @@ inline std::string to_string(const RETURN_STATUS status) {
     }
 }
 
-inline std::ostream& operator<<(std::ostream& os, RETURN_STATUS status) {
+inline std::ostream &operator<<(std::ostream &os, RETURN_STATUS status) {
     switch (status) {
-        case RETURN_STATUS::OSP_SUCCESS:        os << "SUCCESS";        break;
-        case RETURN_STATUS::BEST_FOUND:     os << "BEST_FOUND";     break;
-        case RETURN_STATUS::TIMEOUT:        os << "TIMEOUT";        break;
-        case RETURN_STATUS::ERROR:          os << "ERROR";          break;
-        default:                            os << "UNKNOWN";        break; 
+    case RETURN_STATUS::OSP_SUCCESS:
+        os << "SUCCESS";
+        break;
+    case RETURN_STATUS::BEST_FOUND:
+        os << "BEST_FOUND";
+        break;
+    case RETURN_STATUS::TIMEOUT:
+        os << "TIMEOUT";
+        break;
+    case RETURN_STATUS::ERROR:
+        os << "ERROR";
+        break;
+    default:
+        os << "UNKNOWN";
+        break;
     }
     return os;
 }
@@ -192,13 +205,21 @@ class BspInstance {
 
     /**
      * @brief Returns a copy of the send costs matrix.
-     *
      * @return A copy of the send costs matrix.
      */
-    inline const std::vector<std::vector<v_commw_t<Graph_t>>> &sendCostMatrix() const {
+    inline std::vector<std::vector<v_commw_t<Graph_t>>> sendCostMatrix() const {
         return architecture.sendCostMatrix();
     }
 
+    /**
+     * @brief Returns the flattened send costs vector.
+     *
+     * @return The flattened send costs vector.
+     */
+    inline const std::vector<v_commw_t<Graph_t>> &sendCostsVector() const {
+        return architecture.sendCostsVector();
+    }
+
     /**
      * @brief Returns the communication costs of the BSP architecture.
      *
@@ -389,53 +410,48 @@ class BspInstance {
 };
 
 template<typename Graph_t>
-class compatible_processor_range {
+class CompatibleProcessorRange {
 
     std::vector<std::vector<unsigned>> type_processor_idx;
     const BspInstance<Graph_t> *instance = nullptr;
 
-    public:
+  public:
+    CompatibleProcessorRange() = default;
 
-    compatible_processor_range() = default;
-    
-    compatible_processor_range(const BspInstance<Graph_t> &inst) {
+    CompatibleProcessorRange(const BspInstance<Graph_t> &inst) {
         initialize(inst);
     }
-    
+
     inline void initialize(const BspInstance<Graph_t> &inst) {
 
         instance = &inst;
 
-        if constexpr (has_typed_vertices_v<Graph_t>) {                
-         
+        if constexpr (has_typed_vertices_v<Graph_t>) {
+
             type_processor_idx = std::vector<std::vector<unsigned>>(inst.getComputationalDag().num_vertex_types());
 
             for (v_type_t<Graph_t> v_type = 0; v_type < inst.getComputationalDag().num_vertex_types(); v_type++) {
-                for (unsigned proc = 0; proc < inst.numberOfProcessors(); proc++) 
-                    if (inst.isCompatibleType(v_type, inst.processorType(proc))) 
-                        type_processor_idx[v_type].push_back(proc);                     
-                
+                for (unsigned proc = 0; proc < inst.numberOfProcessors(); proc++)
+                    if (inst.isCompatibleType(v_type, inst.processorType(proc)))
+                        type_processor_idx[v_type].push_back(proc);
             }
-        } 
+        }
     }
 
-    inline const auto & compatible_processors_type(v_type_t<Graph_t> type) const {
+    inline const auto &compatible_processors_type(v_type_t<Graph_t> type) const {
 
         assert(instance != nullptr);
 
         if constexpr (has_typed_vertices_v<Graph_t>) {
-            return type_processor_idx[type];                       
+            return type_processor_idx[type];
         } else {
             return instance->processors();
         }
     }
 
-    inline const auto & compatible_processors_vertex(vertex_idx_t<Graph_t> vertex) const {
+    inline const auto &compatible_processors_vertex(vertex_idx_t<Graph_t> vertex) const {
         return compatible_processors_type(instance->getComputationalDag().vertex_type(vertex));
     }
-
-
 };
 
-
 } // namespace osp
\ No newline at end of file
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
index 38fae9ff..b5b4ea95 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp
@@ -39,6 +39,8 @@ limitations under the License.
 
 namespace osp {
 
+static constexpr unsigned CacheLineSize = 64;
+
 template<typename vert_t, typename weight_t>
 struct GrowLocalAutoCoresParallel_Params {
     vert_t minSuperstepSize = 20;
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
index f6c425bd..2cf0c631 100644
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_bsp_comm_cost.hpp
@@ -97,7 +97,7 @@ struct kl_bsp_comm_cost_function {
     constexpr static bool is_max_comm_cost_function = true;
 
     kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> *active_schedule;
-    compatible_processor_range<Graph_t> *proc_range;
+    CompatibleProcessorRange<Graph_t> *proc_range;
     const Graph_t *graph;
     const BspInstance<Graph_t> *instance;
 
@@ -119,7 +119,7 @@ struct kl_bsp_comm_cost_function {
     }
 
     void initialize(kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> &sched,
-                    compatible_processor_range<Graph_t> &p_range) {
+                    CompatibleProcessorRange<Graph_t> &p_range) {
         active_schedule = &sched;
         proc_range = &p_range;
         instance = &sched.getInstance();
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp
index 50384c72..caaad9ca 100644
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_hyper_total_comm_cost.hpp
@@ -24,24 +24,24 @@ limitations under the License.
 
 namespace osp {
 
-template<typename Graph_t, typename cost_t, typename MemoryConstraint_t, unsigned window_size = 1> 
+template<typename Graph_t, typename cost_t, typename MemoryConstraint_t, unsigned window_size = 1>
 struct kl_hyper_total_comm_cost_function {
-    
+
     using VertexType = vertex_idx_t<Graph_t>;
     using kl_move = kl_move_struct<cost_t, VertexType>;
     using kl_gain_update_info = kl_update_info<VertexType>;
-   
+
     constexpr static unsigned window_range = 2 * window_size + 1;
     constexpr static bool is_max_comm_cost_function = false;
 
     kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> *active_schedule;
 
-    compatible_processor_range<Graph_t> *proc_range;
+    CompatibleProcessorRange<Graph_t> *proc_range;
 
     const Graph_t *graph;
     const BspInstance<Graph_t> *instance;
 
-    cost_t comm_multiplier = 1; 
+    cost_t comm_multiplier = 1;
     cost_t max_comm_weight = 0;
 
     lambda_vector_container<VertexType> node_lambda_map;
@@ -52,20 +52,20 @@ struct kl_hyper_total_comm_cost_function {
     const std::string name() const { return "toal_comm_cost"; }
     inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); }
 
-    void initialize(kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> &sched, compatible_processor_range<Graph_t> &p_range) {
+    void initialize(kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> &sched, CompatibleProcessorRange<Graph_t> &p_range) {
         active_schedule = &sched;
         proc_range = &p_range;
         instance = &sched.getInstance();
         graph = &instance->getComputationalDag();
-        comm_multiplier = 1.0 / instance->numberOfProcessors();  
-        node_lambda_map.initialize(graph->num_vertices(), instance->numberOfProcessors());      
+        comm_multiplier = 1.0 / instance->numberOfProcessors();
+        node_lambda_map.initialize(graph->num_vertices(), instance->numberOfProcessors());
     }
 
     struct empty_struct {};
 
     using pre_move_comm_data_t = empty_struct;
 
-    inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); }
+    inline empty_struct get_pre_move_comm_data(const kl_move &) { return empty_struct(); }
 
     cost_t compute_schedule_cost() {
         cost_t work_costs = 0;
@@ -74,7 +74,7 @@ struct kl_hyper_total_comm_cost_function {
         }
 
         cost_t comm_costs = 0;
-        for(const auto vertex : graph->vertices()) {
+        for (const auto vertex : graph->vertices()) {
             const unsigned vertex_proc = active_schedule->assigned_processor(vertex);
             const cost_t v_comm_cost = graph->vertex_comm_weight(vertex);
             max_comm_weight = std::max(max_comm_weight, v_comm_cost);
@@ -87,7 +87,7 @@ struct kl_hyper_total_comm_cost_function {
                 if (node_lambda_map.increase_proc_count(vertex, target_proc)) {
                     comm_costs += v_comm_cost * instance->communicationCosts(vertex_proc, target_proc); // is 0 if target_proc == vertex_proc
                 }
-            } 
+            }
         }
 
         return work_costs + comm_costs * comm_multiplier + static_cast<v_commw_t<Graph_t>>(active_schedule->num_steps() - 1) * instance->synchronisationCosts();
@@ -100,132 +100,132 @@ struct kl_hyper_total_comm_cost_function {
         }
 
         cost_t comm_costs = 0;
-        for(const auto vertex : graph->vertices()) {
+        for (const auto vertex : graph->vertices()) {
             const unsigned vertex_proc = active_schedule->assigned_processor(vertex);
             const cost_t v_comm_cost = graph->vertex_comm_weight(vertex);
             for (const auto lambdaproc_mult_pair : node_lambda_map.iterate_proc_entries(vertex)) {
                 const auto &lambda_proc = lambdaproc_mult_pair.first;
                 comm_costs += v_comm_cost * instance->communicationCosts(vertex_proc, lambda_proc);
-            } 
+            }
         }
 
         return work_costs + comm_costs * comm_multiplier + static_cast<v_commw_t<Graph_t>>(active_schedule->num_steps() - 1) * instance->synchronisationCosts();
     }
 
-    inline void update_datastructure_after_move(const kl_move & move, const unsigned start_step, const unsigned end_step) {
-        if (move.to_proc != move.from_proc) {  
+    inline void update_datastructure_after_move(const kl_move &move, const unsigned start_step, const unsigned end_step) {
+        if (move.to_proc != move.from_proc) {
             for (const auto &source : instance->getComputationalDag().parents(move.node)) {
                 const unsigned source_step = active_schedule->assigned_superstep(source);
                 if (source_step < start_step || source_step > end_step)
                     continue;
-                update_source_after_move(move, source);    
+                update_source_after_move(move, source);
             }
         }
     }
 
-    inline void update_source_after_move(const kl_move & move, VertexType source) {
+    inline void update_source_after_move(const kl_move &move, VertexType source) {
         node_lambda_map.decrease_proc_count(source, move.from_proc);
         node_lambda_map.increase_proc_count(source, move.to_proc);
     }
 
     template<typename thread_data_t>
-    void update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map<VertexType, kl_gain_update_info> & max_gain_recompute, std::vector<VertexType> &new_nodes) {
-                
+    void update_node_comm_affinity(const kl_move &move, thread_data_t &thread_data, const cost_t &penalty, const cost_t &reward, std::map<VertexType, kl_gain_update_info> &max_gain_recompute, std::vector<VertexType> &new_nodes) {
+
         const unsigned start_step = thread_data.start_step;
         const unsigned end_step = thread_data.end_step;
-                     
+
         for (const auto &target : instance->getComputationalDag().children(move.node)) {
-            const unsigned target_step = active_schedule->assigned_superstep(target); 
+            const unsigned target_step = active_schedule->assigned_superstep(target);
             if (target_step < start_step || target_step > end_step)
                 continue;
 
-            if(thread_data.lock_manager.is_locked(target))
+            if (thread_data.lock_manager.is_locked(target))
                 continue;
 
             if (not thread_data.affinity_table.is_selected(target)) {
-                new_nodes.push_back(target);  
+                new_nodes.push_back(target);
                 continue;
             }
 
             if (max_gain_recompute.find(target) != max_gain_recompute.end()) {
-                max_gain_recompute[target].full_update = true;                
+                max_gain_recompute[target].full_update = true;
             } else {
                 max_gain_recompute[target] = kl_gain_update_info(target, true);
-            }           
+            }
 
             const unsigned target_proc = active_schedule->assigned_processor(target);
-            const unsigned target_start_idx = start_idx(target_step, start_step);            
+            const unsigned target_start_idx = start_idx(target_step, start_step);
             auto &affinity_table = thread_data.affinity_table.at(target);
 
             if (move.from_step < target_step + (move.from_proc == target_proc)) {
-                const unsigned diff = target_step - move.from_step;                
-                const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
-                unsigned idx = target_start_idx; 
+                const unsigned diff = target_step - move.from_step;
+                const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0;
+                unsigned idx = target_start_idx;
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                         affinity_table[p][idx] -= penalty;
-                    }                                                
-                } 
+                    }
+                }
 
                 if (idx - 1 < bound && is_compatible(target, move.from_proc)) {
-                    affinity_table[move.from_proc][idx - 1] += penalty;    
+                    affinity_table[move.from_proc][idx - 1] += penalty;
                 }
 
             } else {
                 const unsigned diff = move.from_step - target_step;
-                const unsigned window_bound = end_idx(target_step, end_step);  
-                unsigned idx = std::min(window_size + diff, window_bound);                  
-                
-                if (idx < window_bound && is_compatible(target, move.from_proc)) { 
-                    affinity_table[move.from_proc][idx] += reward; 
+                const unsigned window_bound = end_idx(target_step, end_step);
+                unsigned idx = std::min(window_size + diff, window_bound);
+
+                if (idx < window_bound && is_compatible(target, move.from_proc)) {
+                    affinity_table[move.from_proc][idx] += reward;
                 }
 
                 idx++;
-                
+
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                         affinity_table[p][idx] += reward;
-                    }                        
-                } 
+                    }
+                }
             }
 
             if (move.to_step < target_step + (move.to_proc == target_proc)) {
-                unsigned idx = target_start_idx; 
-                const unsigned diff = target_step - move.to_step;                
-                const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
+                unsigned idx = target_start_idx;
+                const unsigned diff = target_step - move.to_step;
+                const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0;
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                         affinity_table[p][idx] += penalty;
-                    }                                                
-                } 
+                    }
+                }
 
                 if (idx - 1 < bound && is_compatible(target, move.to_proc)) {
-                    affinity_table[move.to_proc][idx - 1] -= penalty;    
+                    affinity_table[move.to_proc][idx - 1] -= penalty;
                 }
 
             } else {
                 const unsigned diff = move.to_step - target_step;
-                const unsigned window_bound = end_idx(target_step, end_step); 
-                unsigned idx = std::min(window_size + diff, window_bound);                                                     
-                
+                const unsigned window_bound = end_idx(target_step, end_step);
+                unsigned idx = std::min(window_size + diff, window_bound);
+
                 if (idx < window_bound && is_compatible(target, move.to_proc)) {
-                    affinity_table[move.to_proc][idx] -= reward; 
+                    affinity_table[move.to_proc][idx] -= reward;
                 }
 
                 idx++;
-                                    
+
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                         affinity_table[p][idx] -= reward;
-                    }                        
-                } 
+                    }
+                }
             }
 
-            if (move.to_proc != move.from_proc) {                  
+            if (move.to_proc != move.from_proc) {
                 const cost_t comm_gain = graph->vertex_comm_weight(move.node) * comm_multiplier;
-                
+
                 const unsigned window_bound = end_idx(target_step, end_step);
-                for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                     if (p == target_proc)
                         continue;
                     if (node_lambda_map.get_proc_entry(move.node, target_proc) == 1) {
@@ -233,144 +233,143 @@ struct kl_hyper_total_comm_cost_function {
                             const cost_t x = instance->communicationCosts(move.from_proc, target_proc) * comm_gain;
                             const cost_t y = instance->communicationCosts(move.to_proc, target_proc) * comm_gain;
                             affinity_table[p][idx] += x - y;
-                        } 
+                        }
                     }
 
                     if (node_lambda_map.has_no_proc_entry(move.node, p)) {
                         for (unsigned idx = target_start_idx; idx < window_bound; idx++) {
                             const cost_t x = instance->communicationCosts(move.from_proc, p) * comm_gain;
                             const cost_t y = instance->communicationCosts(move.to_proc, p) * comm_gain;
-                            affinity_table[p][idx] -= x - y;                        
+                            affinity_table[p][idx] -= x - y;
                         }
-                    }  
+                    }
                 }
-            } 
+            }
         }
 
-        for (const auto &source : instance->getComputationalDag().parents(move.node)) {            
+        for (const auto &source : instance->getComputationalDag().parents(move.node)) {
 
             if (move.to_proc != move.from_proc) {
-                const unsigned source_proc = active_schedule->assigned_processor(source);   
-                if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) {                    
+                const unsigned source_proc = active_schedule->assigned_processor(source);
+                if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) {
                     const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
 
                     for (const auto &target : instance->getComputationalDag().children(source)) {
                         const unsigned target_step = active_schedule->assigned_superstep(target);
-                        if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target))  
-                            continue;  
+                        if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target))
+                            continue;
 
-                        if (source_proc != move.from_proc && is_compatible(target, move.from_proc)) { 
+                        if (source_proc != move.from_proc && is_compatible(target, move.from_proc)) {
                             if (max_gain_recompute.find(target) != max_gain_recompute.end()) { // todo more specialized update
-                                max_gain_recompute[target].full_update = true;                
+                                max_gain_recompute[target].full_update = true;
                             } else {
                                 max_gain_recompute[target] = kl_gain_update_info(target, true);
-                            }    
+                            }
 
-                            auto & affinity_table_target_from_proc = thread_data.affinity_table.at(target)[move.from_proc];
+                            auto &affinity_table_target_from_proc = thread_data.affinity_table.at(target)[move.from_proc];
                             const unsigned target_window_bound = end_idx(target_step, end_step);
                             const cost_t comm_aff = instance->communicationCosts(source_proc, move.from_proc) * comm_gain;
                             for (unsigned idx = start_idx(target_step, start_step); idx < target_window_bound; idx++) {
                                 affinity_table_target_from_proc[idx] += comm_aff;
                             }
                         }
-                    }                    
-                } else if (node_lambda_map.get_proc_entry(source, move.from_proc) == 1)  {
+                    }
+                } else if (node_lambda_map.get_proc_entry(source, move.from_proc) == 1) {
                     const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
 
                     for (const auto &target : instance->getComputationalDag().children(source)) {
                         const unsigned target_step = active_schedule->assigned_superstep(target);
-                        if ((target_step < start_step || target_step > end_step) || (target == move.node) || thread_data.lock_manager.is_locked(target) || (not thread_data.affinity_table.is_selected(target)))  
-                            continue;   
+                        if ((target_step < start_step || target_step > end_step) || (target == move.node) || thread_data.lock_manager.is_locked(target) || (not thread_data.affinity_table.is_selected(target)))
+                            continue;
 
                         const unsigned target_proc = active_schedule->assigned_processor(target);
-                        if (target_proc == move.from_proc) {      
+                        if (target_proc == move.from_proc) {
                             if (max_gain_recompute.find(target) != max_gain_recompute.end()) { // todo more specialized update
-                                max_gain_recompute[target].full_update = true;                
+                                max_gain_recompute[target].full_update = true;
                             } else {
                                 max_gain_recompute[target] = kl_gain_update_info(target, true);
-                            } 
-                            
+                            }
+
                             const unsigned target_start_idx = start_idx(target_step, start_step);
                             const unsigned target_window_bound = end_idx(target_step, end_step);
-                            auto & affinity_table_target = thread_data.affinity_table.at(target);
+                            auto &affinity_table_target = thread_data.affinity_table.at(target);
                             const cost_t comm_aff = instance->communicationCosts(source_proc, target_proc) * comm_gain;
                             for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                                 if (p == target_proc)
-                                    continue;      
-                                
+                                    continue;
+
                                 for (unsigned idx = target_start_idx; idx < target_window_bound; idx++) {
                                     affinity_table_target[p][idx] -= comm_aff;
-                                } 
+                                }
                             }
                             break; // since node_lambda_map[source][move.from_proc] == 1
-                        }   
-                    }                    
+                        }
+                    }
                 }
 
                 if (node_lambda_map.get_proc_entry(source, move.to_proc) == 1) {
                     const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
-                    
+
                     for (const auto &target : instance->getComputationalDag().children(source)) {
                         const unsigned target_step = active_schedule->assigned_superstep(target);
-                        if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target))  
-                            continue;   
-                        
+                        if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target))
+                            continue;
+
                         if (source_proc != move.to_proc && is_compatible(target, move.to_proc)) {
                             if (max_gain_recompute.find(target) != max_gain_recompute.end()) {
-                                max_gain_recompute[target].full_update = true;                
+                                max_gain_recompute[target].full_update = true;
                             } else {
                                 max_gain_recompute[target] = kl_gain_update_info(target, true);
-                            } 
-                            
+                            }
+
                             const unsigned target_window_bound = end_idx(target_step, end_step);
-                            auto & affinity_table_target_to_proc = thread_data.affinity_table.at(target)[move.to_proc];
+                            auto &affinity_table_target_to_proc = thread_data.affinity_table.at(target)[move.to_proc];
                             const cost_t comm_aff = instance->communicationCosts(source_proc, move.to_proc) * comm_gain;
                             for (unsigned idx = start_idx(target_step, start_step); idx < target_window_bound; idx++) {
                                 affinity_table_target_to_proc[idx] -= comm_aff;
-                            }                              
+                            }
                         }
                     }
-                } else if (node_lambda_map.get_proc_entry(source, move.to_proc) == 2) {  
+                } else if (node_lambda_map.get_proc_entry(source, move.to_proc) == 2) {
                     for (const auto &target : instance->getComputationalDag().children(source)) {
                         const unsigned target_step = active_schedule->assigned_superstep(target);
-                        if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target))  
-                            continue; 
-                        
+                        if ((target_step < start_step || target_step > end_step) || (target == move.node) || (not thread_data.affinity_table.is_selected(target)) || thread_data.lock_manager.is_locked(target))
+                            continue;
+
                         const unsigned target_proc = active_schedule->assigned_processor(target);
                         if (target_proc == move.to_proc) {
                             if (source_proc != target_proc) {
                                 if (max_gain_recompute.find(target) != max_gain_recompute.end()) {
-                                    max_gain_recompute[target].full_update = true;                
+                                    max_gain_recompute[target].full_update = true;
                                 } else {
                                     max_gain_recompute[target] = kl_gain_update_info(target, true);
-                                } 
-                               
+                                }
+
                                 const unsigned target_start_idx = start_idx(target_step, start_step);
                                 const unsigned target_window_bound = end_idx(target_step, end_step);
-                                auto & affinity_table_target = thread_data.affinity_table.at(target);
+                                auto &affinity_table_target = thread_data.affinity_table.at(target);
                                 const cost_t comm_aff = instance->communicationCosts(source_proc, target_proc) * graph->vertex_comm_weight(source) * comm_multiplier;
                                 for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                                     if (p == target_proc)
-                                        continue;      
-                                    
+                                        continue;
+
                                     for (unsigned idx = target_start_idx; idx < target_window_bound; idx++) {
                                         affinity_table_target[p][idx] += comm_aff;
-                                    }                                         
+                                    }
                                 }
                             }
                             break;
-                        }   
-                    }                  
-                }                
+                        }
+                    }
+                }
             }
 
-
-            const unsigned source_step = active_schedule->assigned_superstep(source); 
+            const unsigned source_step = active_schedule->assigned_superstep(source);
             if (source_step < start_step || source_step > end_step)
                 continue;
 
-            if(thread_data.lock_manager.is_locked(source)) 
-                continue;            
+            if (thread_data.lock_manager.is_locked(source))
+                continue;
 
             if (not thread_data.affinity_table.is_selected(source)) {
                 new_nodes.push_back(source);
@@ -378,111 +377,111 @@ struct kl_hyper_total_comm_cost_function {
             }
 
             if (max_gain_recompute.find(source) != max_gain_recompute.end()) {
-                max_gain_recompute[source].full_update = true;                
+                max_gain_recompute[source].full_update = true;
             } else {
                 max_gain_recompute[source] = kl_gain_update_info(source, true);
-            } 
+            }
 
-            const unsigned source_proc = active_schedule->assigned_processor(source);                            
+            const unsigned source_proc = active_schedule->assigned_processor(source);
             const unsigned source_start_idx = start_idx(source_step, start_step);
             const unsigned window_bound = end_idx(source_step, end_step);
-            auto & affinity_table_source = thread_data.affinity_table.at(source);
+            auto &affinity_table_source = thread_data.affinity_table.at(source);
 
             if (move.from_step < source_step + (move.from_proc != source_proc)) {
-                const unsigned diff = source_step - move.from_step; 
-                const unsigned bound = window_size > diff ? window_size - diff : 0; 
+                const unsigned diff = source_step - move.from_step;
+                const unsigned bound = window_size > diff ? window_size - diff : 0;
                 unsigned idx = source_start_idx;
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {  
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         affinity_table_source[p][idx] += reward;
-                    } 
+                    }
                 }
 
                 if (window_size >= diff && is_compatible(source, move.from_proc)) {
-                    affinity_table_source[move.from_proc][idx] += reward;    
+                    affinity_table_source[move.from_proc][idx] += reward;
                 }
 
-            } else {  
+            } else {
                 const unsigned diff = move.from_step - source_step;
-                unsigned idx = window_size + diff; 
-                
+                unsigned idx = window_size + diff;
+
                 if (idx < window_bound && is_compatible(source, move.from_proc)) {
-                    affinity_table_source[move.from_proc][idx] += penalty;                        
+                    affinity_table_source[move.from_proc][idx] += penalty;
                 }
 
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         affinity_table_source[p][idx] -= penalty;
-                    }                        
-                }                     
+                    }
+                }
             }
 
             if (move.to_step < source_step + (move.to_proc != source_proc)) {
-                const unsigned diff = source_step - move.to_step; 
-                const unsigned bound = window_size > diff ? window_size - diff : 0; 
+                const unsigned diff = source_step - move.to_step;
+                const unsigned bound = window_size > diff ? window_size - diff : 0;
                 unsigned idx = source_start_idx;
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {  
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         affinity_table_source[p][idx] -= reward;
-                    } 
+                    }
                 }
 
                 if (window_size >= diff && is_compatible(source, move.to_proc)) {
-                    affinity_table_source[move.to_proc][idx] -= reward;    
+                    affinity_table_source[move.to_proc][idx] -= reward;
                 }
 
-            } else { 
+            } else {
                 const unsigned diff = move.to_step - source_step;
-                unsigned idx = window_size + diff; 
+                unsigned idx = window_size + diff;
 
                 if (idx < window_bound && is_compatible(source, move.to_proc)) {
-                    affinity_table_source[move.to_proc][idx] -= penalty;                         
+                    affinity_table_source[move.to_proc][idx] -= penalty;
                 }
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         affinity_table_source[p][idx] += penalty;
-                    }                        
-                }                     
-            }  
-        
-            if (move.to_proc != move.from_proc) {   
-                if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) {                    
+                    }
+                }
+            }
+
+            if (move.to_proc != move.from_proc) {
+                if (node_lambda_map.has_no_proc_entry(source, move.from_proc)) {
                     const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
 
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {        
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         if (p == source_proc)
                             continue;
 
                         const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, move.from_proc), instance->communicationCosts(source_proc, move.from_proc), comm_gain);
                         for (unsigned idx = source_start_idx; idx < window_bound; idx++) {
                             affinity_table_source[p][idx] -= comm_cost;
-                        }                        
-                    }                  
-                } 
+                        }
+                    }
+                }
 
                 if (node_lambda_map.get_proc_entry(source, move.to_proc) == 1) {
                     const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
 
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {        
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         if (p == source_proc)
                             continue;
 
                         const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, move.to_proc), instance->communicationCosts(source_proc, move.to_proc), comm_gain);
                         for (unsigned idx = source_start_idx; idx < window_bound; idx++) {
                             affinity_table_source[p][idx] += comm_cost;
-                        }                 
+                        }
                     }
-                }                 
-            }                
-        }  
+                }
+            }
+        }
     }
 
     inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return node_step < window_size + start_step ? window_size - (node_step - start_step) : 0; }
-    inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return node_step + window_size <= end_step ? window_range : window_range - (node_step + window_size - end_step); }   
-    inline cost_t change_comm_cost(const v_commw_t<Graph_t> &p_target_comm_cost, const v_commw_t<Graph_t> &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;}
+    inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return node_step + window_size <= end_step ? window_range : window_range - (node_step + window_size - end_step); }
+    inline cost_t change_comm_cost(const v_commw_t<Graph_t> &p_target_comm_cost, const v_commw_t<Graph_t> &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0; }
 
     template<typename affinity_table_t>
-    void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) {
+    void compute_comm_affinity(VertexType node, affinity_table_t &affinity_table_node, const cost_t &penalty, const cost_t &reward, const unsigned start_step, const unsigned end_step) {
         const unsigned node_step = active_schedule->assigned_superstep(node);
         const unsigned node_proc = active_schedule->assigned_processor(node);
         const unsigned window_bound = end_idx(node_step, end_step);
@@ -490,42 +489,42 @@ struct kl_hyper_total_comm_cost_function {
 
         for (const auto &target : instance->getComputationalDag().children(node)) {
             const unsigned target_step = active_schedule->assigned_superstep(target);
-            const unsigned target_proc = active_schedule->assigned_processor(target); 
+            const unsigned target_proc = active_schedule->assigned_processor(target);
 
             if (target_step < node_step + (target_proc != node_proc)) {
-                const unsigned diff = node_step - target_step; 
-                const unsigned bound = window_size > diff ? window_size - diff : 0; 
+                const unsigned diff = node_step - target_step;
+                const unsigned bound = window_size > diff ? window_size - diff : 0;
                 unsigned idx = node_start_idx;
 
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
+                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
                         affinity_table_node[p][idx] -= reward;
-                    } 
+                    }
                 }
 
                 if (window_size >= diff && is_compatible(node, target_proc)) {
-                    affinity_table_node[target_proc][idx] -= reward;    
-                }  
+                    affinity_table_node[target_proc][idx] -= reward;
+                }
 
-            } else {  
+            } else {
                 const unsigned diff = target_step - node_step;
                 unsigned idx = window_size + diff;
 
                 if (idx < window_bound && is_compatible(node, target_proc)) {
-                    affinity_table_node[target_proc][idx] -= penalty; 
+                    affinity_table_node[target_proc][idx] -= penalty;
                 }
 
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                      
+                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
                         affinity_table_node[p][idx] += penalty;
-                    }                        
-                }                     
-            }    
+                    }
+                }
+            }
         } // traget
 
         const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier;
 
-        for (const unsigned p : proc_range->compatible_processors_vertex(node)) {        
+        for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
             if (p == node_proc)
                 continue;
 
@@ -540,21 +539,21 @@ struct kl_hyper_total_comm_cost_function {
 
         for (const auto &source : instance->getComputationalDag().parents(node)) {
             const unsigned source_step = active_schedule->assigned_superstep(source);
-            const unsigned source_proc = active_schedule->assigned_processor(source);  
+            const unsigned source_proc = active_schedule->assigned_processor(source);
 
             if (source_step < node_step + (source_proc == node_proc)) {
-                const unsigned diff = node_step - source_step;                
-                const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
+                const unsigned diff = node_step - source_step;
+                const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0;
                 unsigned idx = node_start_idx;
 
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
-                        affinity_table_node[p][idx] += penalty; 
-                    }                                                
+                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
+                        affinity_table_node[p][idx] += penalty;
+                    }
                 }
 
                 if (idx - 1 < bound && is_compatible(node, source_proc)) {
-                    affinity_table_node[source_proc][idx - 1] -= penalty;    
+                    affinity_table_node[source_proc][idx - 1] -= penalty;
                 }
 
             } else {
@@ -562,34 +561,34 @@ struct kl_hyper_total_comm_cost_function {
                 unsigned idx = std::min(window_size + diff, window_bound);
 
                 if (idx < window_bound && is_compatible(node, source_proc)) {
-                    affinity_table_node[source_proc][idx] -= reward;  
-                }    
+                    affinity_table_node[source_proc][idx] -= reward;
+                }
 
                 idx++;
 
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
+                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
                         affinity_table_node[p][idx] -= reward;
-                    }                        
-                } 
+                    }
+                }
             }
 
             const cost_t source_comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
-            for (const unsigned p : proc_range->compatible_processors_vertex(node)) { 
+            for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
                 if (p == node_proc)
                     continue;
 
                 if (source_proc != node_proc && node_lambda_map.get_proc_entry(source, node_proc) == 1) {
                     for (unsigned idx = node_start_idx; idx < window_bound; idx++) {
                         affinity_table_node[p][idx] -= instance->communicationCosts(source_proc, node_proc) * source_comm_gain;
-                    } 
+                    }
                 }
 
                 if (source_proc != p && node_lambda_map.has_no_proc_entry(source, p)) {
                     for (unsigned idx = node_start_idx; idx < window_bound; idx++) {
                         affinity_table_node[p][idx] += instance->communicationCosts(source_proc, p) * source_comm_gain;
                     }
-                }            
+                }
             }
         } // source
     }
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp
index be7c627c..5f471077 100644
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_comm_cost.hpp
@@ -22,26 +22,26 @@ limitations under the License.
 #include "../kl_improver.hpp"
 
 namespace osp {
-template<typename Graph_t, typename cost_t, typename MemoryConstraint_t, unsigned window_size = 1, bool use_node_communication_costs_arg = true> 
+template<typename Graph_t, typename cost_t, typename MemoryConstraint_t, unsigned window_size = 1, bool use_node_communication_costs_arg = true>
 struct kl_total_comm_cost_function {
-    
+
     using VertexType = vertex_idx_t<Graph_t>;
     using kl_move = kl_move_struct<cost_t, VertexType>;
     using kl_gain_update_info = kl_update_info<VertexType>;
-    
+
     constexpr static bool is_max_comm_cost_function = false;
 
     constexpr static unsigned window_range = 2 * window_size + 1;
     constexpr static bool use_node_communication_costs = use_node_communication_costs_arg || not has_edge_weights_v<Graph_t>;
-     
+
     kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> *active_schedule;
 
-    compatible_processor_range<Graph_t> *proc_range;
+    CompatibleProcessorRange<Graph_t> *proc_range;
 
     const Graph_t *graph;
     const BspInstance<Graph_t> *instance;
 
-    cost_t comm_multiplier = 1; 
+    cost_t comm_multiplier = 1;
     cost_t max_comm_weight = 0;
 
     inline cost_t get_comm_multiplier() { return comm_multiplier; }
@@ -52,23 +52,23 @@ struct kl_total_comm_cost_function {
 
     inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); }
 
-    void initialize(kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> &sched, compatible_processor_range<Graph_t> &p_range) {
+    void initialize(kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> &sched, CompatibleProcessorRange<Graph_t> &p_range) {
         active_schedule = &sched;
         proc_range = &p_range;
         instance = &sched.getInstance();
         graph = &instance->getComputationalDag();
-        comm_multiplier = 1.0 / instance->numberOfProcessors();        
+        comm_multiplier = 1.0 / instance->numberOfProcessors();
     }
 
     struct empty_struct {};
     using pre_move_comm_data_t = empty_struct;
-    inline empty_struct get_pre_move_comm_data(const kl_move& ) { return empty_struct(); }
+    inline empty_struct get_pre_move_comm_data(const kl_move &) { return empty_struct(); }
 
     cost_t compute_schedule_cost_test() {
         return compute_schedule_cost();
     }
 
-    void update_datastructure_after_move(const kl_move&, const unsigned, const unsigned) {}
+    void update_datastructure_after_move(const kl_move &, const unsigned, const unsigned) {}
 
     cost_t compute_schedule_cost() {
 
@@ -89,7 +89,7 @@ struct kl_total_comm_cost_function {
             if (source_proc != target_proc) {
 
                 if constexpr (use_node_communication_costs) {
-                    const cost_t source_comm_cost = graph->vertex_comm_weight(source_v); 
+                    const cost_t source_comm_cost = graph->vertex_comm_weight(source_v);
                     max_comm_weight = std::max(max_comm_weight, source_comm_cost);
                     comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc);
                 } else {
@@ -98,108 +98,108 @@ struct kl_total_comm_cost_function {
                     comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc);
                 }
             }
-        }  
+        }
 
         return work_costs + comm_costs * comm_multiplier + static_cast<v_commw_t<Graph_t>>(active_schedule->num_steps() - 1) * instance->synchronisationCosts();
     }
 
     template<typename thread_data_t>
-    void update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map<VertexType, kl_gain_update_info> & max_gain_recompute, std::vector<VertexType> &new_nodes) {
-         
-        const unsigned & start_step = thread_data.start_step; 
-        const unsigned & end_step = thread_data.end_step;
+    void update_node_comm_affinity(const kl_move &move, thread_data_t &thread_data, const cost_t &penalty, const cost_t &reward, std::map<VertexType, kl_gain_update_info> &max_gain_recompute, std::vector<VertexType> &new_nodes) {
+
+        const unsigned &start_step = thread_data.start_step;
+        const unsigned &end_step = thread_data.end_step;
 
         for (const auto &target : instance->getComputationalDag().children(move.node)) {
 
-            const unsigned target_step = active_schedule->assigned_superstep(target); 
+            const unsigned target_step = active_schedule->assigned_superstep(target);
             if (target_step < start_step || target_step > end_step)
                 continue;
 
-            if(thread_data.lock_manager.is_locked(target))
+            if (thread_data.lock_manager.is_locked(target))
                 continue;
 
             if (not thread_data.affinity_table.is_selected(target)) {
-                new_nodes.push_back(target);  
+                new_nodes.push_back(target);
                 continue;
             }
 
             if (max_gain_recompute.find(target) != max_gain_recompute.end()) {
-                max_gain_recompute[target].full_update = true;                
+                max_gain_recompute[target].full_update = true;
             } else {
                 max_gain_recompute[target] = kl_gain_update_info(target, true);
-            }           
+            }
 
             const unsigned target_proc = active_schedule->assigned_processor(target);
-            const unsigned target_start_idx = start_idx(target_step, start_step);             
-            auto & affinity_table_target = thread_data.affinity_table.at(target);
+            const unsigned target_start_idx = start_idx(target_step, start_step);
+            auto &affinity_table_target = thread_data.affinity_table.at(target);
 
             if (move.from_step < target_step + (move.from_proc == target_proc)) {
 
-                const unsigned diff = target_step - move.from_step;                
-                const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
-                unsigned idx = target_start_idx; 
+                const unsigned diff = target_step - move.from_step;
+                const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0;
+                unsigned idx = target_start_idx;
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                         affinity_table_target[p][idx] -= penalty;
-                    }                                                
-                } 
+                    }
+                }
 
                 if (idx - 1 < bound && is_compatible(target, move.from_proc)) {
-                    affinity_table_target[move.from_proc][idx - 1] += penalty;    
+                    affinity_table_target[move.from_proc][idx - 1] += penalty;
                 }
 
             } else {
 
                 const unsigned diff = move.from_step - target_step;
-                const unsigned window_bound = end_idx(target_step, end_step);  
-                unsigned idx = std::min(window_size + diff, window_bound);                  
-                
-                if (idx < window_bound && is_compatible(target, move.from_proc)) { 
-                    affinity_table_target[move.from_proc][idx] += reward; 
+                const unsigned window_bound = end_idx(target_step, end_step);
+                unsigned idx = std::min(window_size + diff, window_bound);
+
+                if (idx < window_bound && is_compatible(target, move.from_proc)) {
+                    affinity_table_target[move.from_proc][idx] += reward;
                 }
 
                 idx++;
-                
+
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                         affinity_table_target[p][idx] += reward;
-                    }                        
-                } 
+                    }
+                }
             }
 
             if (move.to_step < target_step + (move.to_proc == target_proc)) {
-                unsigned idx = target_start_idx; 
-                const unsigned diff = target_step - move.to_step;                
-                const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
+                unsigned idx = target_start_idx;
+                const unsigned diff = target_step - move.to_step;
+                const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0;
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                         affinity_table_target[p][idx] += penalty;
-                    }                                                
-                } 
+                    }
+                }
 
                 if (idx - 1 < bound && is_compatible(target, move.to_proc)) {
-                    affinity_table_target[move.to_proc][idx - 1] -= penalty;    
+                    affinity_table_target[move.to_proc][idx - 1] -= penalty;
                 }
 
             } else {
                 const unsigned diff = move.to_step - target_step;
-                const unsigned window_bound = end_idx(target_step, end_step); 
-                unsigned idx = std::min(window_size + diff, window_bound);                                                     
-                
+                const unsigned window_bound = end_idx(target_step, end_step);
+                unsigned idx = std::min(window_size + diff, window_bound);
+
                 if (idx < window_bound && is_compatible(target, move.to_proc)) {
-                    affinity_table_target[move.to_proc][idx] -= reward; 
+                    affinity_table_target[move.to_proc][idx] -= reward;
                 }
 
                 idx++;
-                                    
+
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
                         affinity_table_target[p][idx] -= reward;
-                    }                        
-                } 
+                    }
+                }
             }
-        
-            if (move.to_proc != move.from_proc) {                
+
+            if (move.to_proc != move.from_proc) {
                 const auto from_proc_target_comm_cost = instance->communicationCosts(move.from_proc, target_proc);
                 const auto to_proc_target_comm_cost = instance->communicationCosts(move.to_proc, target_proc);
 
@@ -209,21 +209,21 @@ struct kl_total_comm_cost_function {
                 const unsigned window_bound = end_idx(target_step, end_step);
                 for (; idx < window_bound; idx++) {
                     for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
-                        const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain); 
+                        const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain);
                         const auto y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_target_comm_cost, comm_gain);
-                        affinity_table_target[p][idx] += x - y;  
+                        affinity_table_target[p][idx] += x - y;
                     }
                 }
-            } 
+            }
         }
 
         for (const auto &source : instance->getComputationalDag().parents(move.node)) {
 
-            const unsigned source_step = active_schedule->assigned_superstep(source); 
+            const unsigned source_step = active_schedule->assigned_superstep(source);
             if (source_step < start_step || source_step > end_step)
                 continue;
 
-            if(thread_data.lock_manager.is_locked(source))
+            if (thread_data.lock_manager.is_locked(source))
                 continue;
 
             if (not thread_data.affinity_table.is_selected(source)) {
@@ -232,75 +232,75 @@ struct kl_total_comm_cost_function {
             }
 
             if (max_gain_recompute.find(source) != max_gain_recompute.end()) {
-                max_gain_recompute[source].full_update = true;                
+                max_gain_recompute[source].full_update = true;
             } else {
                 max_gain_recompute[source] = kl_gain_update_info(source, true);
-            } 
+            }
 
             const unsigned source_proc = active_schedule->assigned_processor(source);
             const unsigned window_bound = end_idx(source_step, end_step);
-            auto & affinity_table_source = thread_data.affinity_table.at(source);
+            auto &affinity_table_source = thread_data.affinity_table.at(source);
 
             if (move.from_step < source_step + (move.from_proc != source_proc)) {
 
-                const unsigned diff = source_step - move.from_step; 
-                const unsigned bound = window_size > diff ? window_size - diff : 0; 
+                const unsigned diff = source_step - move.from_step;
+                const unsigned bound = window_size > diff ? window_size - diff : 0;
                 unsigned idx = start_idx(source_step, start_step);
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {  
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         affinity_table_source[p][idx] += reward;
-                    } 
+                    }
                 }
 
                 if (window_size >= diff && is_compatible(source, move.from_proc)) {
-                    affinity_table_source[move.from_proc][idx] += reward;    
+                    affinity_table_source[move.from_proc][idx] += reward;
                 }
 
-            } else {       
+            } else {
 
                 const unsigned diff = move.from_step - source_step;
-                unsigned idx = window_size + diff; 
-                
+                unsigned idx = window_size + diff;
+
                 if (idx < window_bound && is_compatible(source, move.from_proc)) {
-                    affinity_table_source[move.from_proc][idx] += penalty;                        
+                    affinity_table_source[move.from_proc][idx] += penalty;
                 }
 
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         affinity_table_source[p][idx] -= penalty;
-                    }                        
-                }                     
+                    }
+                }
             }
 
             if (move.to_step < source_step + (move.to_proc != source_proc)) {
-                const unsigned diff = source_step - move.to_step; 
-                const unsigned bound = window_size > diff ? window_size - diff : 0; 
+                const unsigned diff = source_step - move.to_step;
+                const unsigned bound = window_size > diff ? window_size - diff : 0;
                 unsigned idx = start_idx(source_step, start_step);
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {  
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         affinity_table_source[p][idx] -= reward;
-                    } 
+                    }
                 }
 
                 if (window_size >= diff && is_compatible(source, move.to_proc)) {
-                    affinity_table_source[move.to_proc][idx] -= reward;    
+                    affinity_table_source[move.to_proc][idx] -= reward;
                 }
 
-            } else {  
+            } else {
                 const unsigned diff = move.to_step - source_step;
-                unsigned idx = window_size + diff; 
+                unsigned idx = window_size + diff;
 
                 if (idx < window_bound && is_compatible(source, move.to_proc)) {
-                    affinity_table_source[move.to_proc][idx] -= penalty;                         
+                    affinity_table_source[move.to_proc][idx] -= penalty;
                 }
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) { 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
                         affinity_table_source[p][idx] += penalty;
-                    }                        
-                }                     
-            }  
+                    }
+                }
+            }
 
-            if (move.to_proc != move.from_proc) {                
+            if (move.to_proc != move.from_proc) {
                 const auto from_proc_source_comm_cost = instance->communicationCosts(source_proc, move.from_proc);
                 const auto to_proc_source_comm_cost = instance->communicationCosts(source_proc, move.to_proc);
 
@@ -308,23 +308,23 @@ struct kl_total_comm_cost_function {
 
                 unsigned idx = start_idx(source_step, start_step);
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) { 
-                        const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain); 
+                    for (const unsigned p : proc_range->compatible_processors_vertex(source)) {
+                        const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain);
                         const cost_t y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_source_comm_cost, comm_gain);
-                        affinity_table_source[p][idx] += x - y;  
+                        affinity_table_source[p][idx] += x - y;
                     }
                 }
             }
-        } 
+        }
     }
 
     inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return (node_step < window_size + start_step) ? window_size - (node_step - start_step) : 0; }
     inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return (node_step + window_size <= end_step) ? window_range : window_range - (node_step + window_size - end_step); }
 
-    inline cost_t change_comm_cost(const v_commw_t<Graph_t> &p_target_comm_cost, const v_commw_t<Graph_t> &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;}
+    inline cost_t change_comm_cost(const v_commw_t<Graph_t> &p_target_comm_cost, const v_commw_t<Graph_t> &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0; }
 
     template<typename affinity_table_t>
-    void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) {
+    void compute_comm_affinity(VertexType node, affinity_table_t &affinity_table_node, const cost_t &penalty, const cost_t &reward, const unsigned start_step, const unsigned end_step) {
         const unsigned node_step = active_schedule->assigned_superstep(node);
         const unsigned node_proc = active_schedule->assigned_processor(node);
         const unsigned window_bound = end_idx(node_step, end_step);
@@ -332,37 +332,37 @@ struct kl_total_comm_cost_function {
 
         for (const auto &target : instance->getComputationalDag().children(node)) {
             const unsigned target_step = active_schedule->assigned_superstep(target);
-            const unsigned target_proc = active_schedule->assigned_processor(target); 
+            const unsigned target_proc = active_schedule->assigned_processor(target);
 
             if (target_step < node_step + (target_proc != node_proc)) {
-                const unsigned diff = node_step - target_step; 
-                const unsigned bound = window_size > diff ? window_size - diff : 0; 
+                const unsigned diff = node_step - target_step;
+                const unsigned bound = window_size > diff ? window_size - diff : 0;
                 unsigned idx = node_start_idx;
 
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
+                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
                         affinity_table_node[p][idx] -= reward;
-                    } 
+                    }
                 }
 
                 if (window_size >= diff && is_compatible(node, target_proc)) {
-                    affinity_table_node[target_proc][idx] -= reward;    
-                }  
+                    affinity_table_node[target_proc][idx] -= reward;
+                }
 
-            } else {  
+            } else {
                 const unsigned diff = target_step - node_step;
                 unsigned idx = window_size + diff;
 
                 if (idx < window_bound && is_compatible(node, target_proc)) {
-                    affinity_table_node[target_proc][idx] -= penalty; 
+                    affinity_table_node[target_proc][idx] -= penalty;
                 }
 
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                      
+                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
                         affinity_table_node[p][idx] += penalty;
-                    }                        
-                }                     
-            }    
+                    }
+                }
+            }
 
             const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier;
             const auto node_target_comm_cost = instance->communicationCosts(node_proc, target_proc);
@@ -378,21 +378,21 @@ struct kl_total_comm_cost_function {
 
         for (const auto &source : instance->getComputationalDag().parents(node)) {
             const unsigned source_step = active_schedule->assigned_superstep(source);
-            const unsigned source_proc = active_schedule->assigned_processor(source);  
+            const unsigned source_proc = active_schedule->assigned_processor(source);
 
             if (source_step < node_step + (source_proc == node_proc)) {
-                const unsigned diff = node_step - source_step;                
-                const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
+                const unsigned diff = node_step - source_step;
+                const unsigned bound = window_size >= diff ? window_size - diff + 1 : 0;
                 unsigned idx = node_start_idx;
 
                 for (; idx < bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
-                        affinity_table_node[p][idx] += penalty; 
-                    }                                                
+                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
+                        affinity_table_node[p][idx] += penalty;
+                    }
                 }
 
                 if (idx - 1 < bound && is_compatible(node, source_proc)) {
-                    affinity_table_node[source_proc][idx - 1] -= penalty;    
+                    affinity_table_node[source_proc][idx - 1] -= penalty;
                 }
 
             } else {
@@ -400,22 +400,22 @@ struct kl_total_comm_cost_function {
                 unsigned idx = std::min(window_size + diff, window_bound);
 
                 if (idx < window_bound && is_compatible(node, source_proc)) {
-                    affinity_table_node[source_proc][idx] -= reward;  
+                    affinity_table_node[source_proc][idx] -= reward;
                 }
-                
+
                 idx++;
 
                 for (; idx < window_bound; idx++) {
-                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
+                    for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
                         affinity_table_node[p][idx] -= reward;
-                    }                        
-                } 
+                    }
+                }
             }
 
             const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
             const auto source_node_comm_cost = instance->communicationCosts(source_proc, node_proc);
 
-            for (const unsigned p : proc_range->compatible_processors_vertex(node)) {   
+            for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
                 const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, source_proc), source_node_comm_cost, comm_gain);
                 for (unsigned idx = node_start_idx; idx < window_bound; idx++) {
                     affinity_table_node[p][idx] += comm_cost;
@@ -426,4 +426,3 @@ struct kl_total_comm_cost_function {
 };
 
 } // namespace osp
-
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp
deleted file mode 100644
index f13abda9..00000000
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/comm_cost_modules/kl_total_cut_cost.hpp
+++ /dev/null
@@ -1,431 +0,0 @@
-// /*
-// Copyright 2024 Huawei Technologies Co., Ltd.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
-// */
-
-// #pragma once
-
-// #include "../kl_active_schedule.hpp"
-// #include "../kl_improver.hpp"
-
-// namespace osp {
-// template<typename Graph_t, typename cost_t, typename MemoryConstraint_t, unsigned window_size = 1, bool use_node_communication_costs_arg = true> 
-// struct kl_total_cut_cost_function {
-    
-//     using VertexType = vertex_idx_t<Graph_t>;
-//     using kl_move = kl_move_struct<cost_t, VertexType>;
-//     using kl_gain_update_info = kl_update_info<VertexType>;
-    
-//     constexpr static unsigned window_range = 2 * window_size + 1;
-//     constexpr static bool use_node_communication_costs = use_node_communication_costs_arg || not has_edge_weights_v<Graph_t>;
-     
-//     kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> *active_schedule;
-
-//     compatible_processor_range<Graph_t> *proc_range;
-
-//     const Graph_t *graph;
-//     const BspInstance<Graph_t> *instance;
-
-//     cost_t comm_multiplier = 1; 
-//     cost_t max_comm_weight = 0;
-
-//     inline cost_t get_comm_multiplier() { return comm_multiplier; }
-//     inline cost_t get_max_comm_weight() { return max_comm_weight; }
-//     inline cost_t get_max_comm_weight_multiplied() { return max_comm_weight * comm_multiplier; }
-
-//     const std::string name() const { return "toal_comm_cost"; }
-
-//     inline bool is_compatible(VertexType node, unsigned proc) { return active_schedule->getInstance().isCompatible(node, proc); }
-
-//     void initialize(kl_active_schedule<Graph_t, cost_t, MemoryConstraint_t> &sched, compatible_processor_range<Graph_t> &p_range) {
-//         active_schedule = &sched;
-//         proc_range = &p_range;
-//         instance = &sched.getInstance();
-//         graph = &instance->getComputationalDag();
-//         comm_multiplier = 1.0 / instance->numberOfProcessors();        
-//     }
-
-//     cost_t compute_schedule_cost_test() {
-//         return compute_schedule_cost();
-//     }
-
-//     void update_datastructure_after_move(const kl_move&, const unsigned, const unsigned) {}
-
-//     cost_t compute_schedule_cost() {
-
-//         cost_t work_costs = 0;
-//         for (unsigned step = 0; step < active_schedule->num_steps(); step++) {
-//             work_costs += active_schedule->get_step_max_work(step);
-//         }
-
-//         cost_t comm_costs = 0;
-//         for (const auto &edge : edges(*graph)) {
-
-//             const auto &source_v = source(edge, *graph);
-//             const auto &target_v = target(edge, *graph);
-
-//             const unsigned &source_proc = active_schedule->assigned_processor(source_v);
-//             const unsigned &target_proc = active_schedule->assigned_processor(target_v);
-
-//             if ((source_proc != target_proc) || (active_schedule->assigned_superstep(source_v) != active_schedule->assigned_superstep(target_v))) {
-
-//                 if constexpr (use_node_communication_costs) {
-//                     const cost_t source_comm_cost = graph->vertex_comm_weight(source_v); 
-//                     max_comm_weight = std::max(max_comm_weight, source_comm_cost);
-//                     comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc);
-//                 } else {
-//                     const cost_t source_comm_cost = graph->edge_comm_weight(edge);
-//                     max_comm_weight = std::max(max_comm_weight, source_comm_cost);
-//                     comm_costs += source_comm_cost * instance->communicationCosts(source_proc, target_proc);
-//                 }
-//             }
-//         }  
-
-//         return work_costs + comm_costs * comm_multiplier + static_cast<v_commw_t<Graph_t>>(active_schedule->num_steps() - 1) * instance->synchronisationCosts();
-//     }
-
-//     template<typename thread_data_t>
-//     void update_node_comm_affinity(const kl_move &move, thread_data_t& thread_data, const cost_t& penalty, const cost_t& reward, std::map<VertexType, kl_gain_update_info> & max_gain_recompute, std::vector<VertexType> &new_nodes) {
-         
-//         const unsigned & start_step = thread_data.start_step; 
-//         const unsigned & end_step = thread_data.end_step;
-
-//         for (const auto &target : instance->getComputationalDag().children(move.node)) {
-
-//             const unsigned target_step = active_schedule->assigned_superstep(target); 
-//             if (target_step < start_step || target_step > end_step)
-//                 continue;
-
-//             if(thread_data.lock_manager.is_locked(target))
-//                 continue;
-
-//             if (not thread_data.affinity_table.is_selected(target)) {
-//                 new_nodes.push_back(target);  
-//                 continue;
-//             }
-
-//             if (max_gain_recompute.find(target) != max_gain_recompute.end()) {
-//                 max_gain_recompute[target].full_update = true;                
-//             } else {
-//                 max_gain_recompute[target] = kl_gain_update_info(target, true);
-//             }           
-
-//             const unsigned target_proc = active_schedule->assigned_processor(target);
-//             const unsigned target_start_idx = start_idx(target_step, start_step);             
-//             auto & affinity_table_target = thread_data.affinity_table.at(target);
-
-//             if (move.from_step < target_step + (move.from_proc == target_proc)) {
-
-//                 const unsigned diff = target_step - move.from_step;                
-//                 const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
-//                 unsigned idx = target_start_idx; 
-//                 for (; idx < bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
-//                         affinity_table_target[p][idx] -= penalty;
-//                     }                                                
-//                 } 
-
-//                 if (idx - 1 < bound && is_compatible(target, move.from_proc)) {
-//                     affinity_table_target[move.from_proc][idx - 1] += penalty;    
-//                 }
-
-//             } else {
-
-//                 const unsigned diff = move.from_step - target_step;
-//                 const unsigned window_bound = end_idx(target_step, end_step);  
-//                 unsigned idx = std::min(window_size + diff, window_bound);                  
-                
-//                 if (idx < window_bound && is_compatible(target, move.from_proc)) { 
-//                     affinity_table_target[move.from_proc][idx] += reward; 
-//                 }
-
-//                 idx++;
-                
-//                 for (; idx < window_bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
-//                         affinity_table_target[p][idx] += reward;
-//                     }                        
-//                 } 
-//             }
-
-//             if (move.to_step < target_step + (move.to_proc == target_proc)) {
-//                 unsigned idx = target_start_idx; 
-//                 const unsigned diff = target_step - move.to_step;                
-//                 const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
-//                 for (; idx < bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
-//                         affinity_table_target[p][idx] += penalty;
-//                     }                                                
-//                 } 
-
-//                 if (idx - 1 < bound && is_compatible(target, move.to_proc)) {
-//                     affinity_table_target[move.to_proc][idx - 1] -= penalty;    
-//                 }
-
-//             } else {
-//                 const unsigned diff = move.to_step - target_step;
-//                 const unsigned window_bound = end_idx(target_step, end_step); 
-//                 unsigned idx = std::min(window_size + diff, window_bound);                                                     
-                
-//                 if (idx < window_bound && is_compatible(target, move.to_proc)) {
-//                     affinity_table_target[move.to_proc][idx] -= reward; 
-//                 }
-
-//                 idx++;
-                                    
-//                 for (; idx < window_bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(target)) { 
-//                         affinity_table_target[p][idx] -= reward;
-//                     }                        
-//                 } 
-//             }
-        
-//             if (move.to_proc != move.from_proc) {                
-//                 const auto from_proc_target_comm_cost = instance->communicationCosts(move.from_proc, target_proc);
-//                 const auto to_proc_target_comm_cost = instance->communicationCosts(move.to_proc, target_proc);
-
-//                 const cost_t comm_gain = graph->vertex_comm_weight(move.node) * comm_multiplier;
-
-//                 unsigned idx = target_start_idx;
-//                 const unsigned window_bound = end_idx(target_step, end_step);
-//                 for (; idx < window_bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(target)) {
-//                         const auto x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_target_comm_cost, comm_gain); 
-//                         const auto y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_target_comm_cost, comm_gain);
-//                         affinity_table_target[p][idx] += x - y;  
-//                     }
-//                 }
-//             } 
-//         }
-
-//         for (const auto &source : instance->getComputationalDag().parents(move.node)) {
-
-//             const unsigned source_step = active_schedule->assigned_superstep(source); 
-//             if (source_step < start_step || source_step > end_step)
-//                 continue;
-
-//             if(thread_data.lock_manager.is_locked(source))
-//                 continue;
-
-//             if (not thread_data.affinity_table.is_selected(source)) {
-//                 new_nodes.push_back(source);
-//                 continue;
-//             }
-
-//             if (max_gain_recompute.find(source) != max_gain_recompute.end()) {
-//                 max_gain_recompute[source].full_update = true;                
-//             } else {
-//                 max_gain_recompute[source] = kl_gain_update_info(source, true);
-//             } 
-
-//             const unsigned source_proc = active_schedule->assigned_processor(source);
-//             const unsigned window_bound = end_idx(source_step, end_step);
-//             auto & affinity_table_source = thread_data.affinity_table.at(source);
-
-//             if (move.from_step < source_step + (move.from_proc != source_proc)) {
-
-//                 const unsigned diff = source_step - move.from_step; 
-//                 const unsigned bound = window_size > diff ? window_size - diff : 0; 
-//                 unsigned idx = start_idx(source_step, start_step);
-//                 for (; idx < bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(source)) {  
-//                         affinity_table_source[p][idx] += reward;
-//                     } 
-//                 }
-
-//                 if (window_size >= diff && is_compatible(source, move.from_proc)) {
-//                     affinity_table_source[move.from_proc][idx] += reward;    
-//                 }
-
-//             } else {       
-
-//                 const unsigned diff = move.from_step - source_step;
-//                 unsigned idx = window_size + diff; 
-                
-//                 if (idx < window_bound && is_compatible(source, move.from_proc)) {
-//                     affinity_table_source[move.from_proc][idx] += penalty;                        
-//                 }
-
-//                 for (; idx < window_bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(source)) { 
-//                         affinity_table_source[p][idx] -= penalty;
-//                     }                        
-//                 }                     
-//             }
-
-//             if (move.to_step < source_step + (move.to_proc != source_proc)) {
-//                 const unsigned diff = source_step - move.to_step; 
-//                 const unsigned bound = window_size > diff ? window_size - diff : 0; 
-//                 unsigned idx = start_idx(source_step, start_step);
-//                 for (; idx < bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(source)) {  
-//                         affinity_table_source[p][idx] -= reward;
-//                     } 
-//                 }
-
-//                 if (window_size >= diff && is_compatible(source, move.to_proc)) {
-//                     affinity_table_source[move.to_proc][idx] -= reward;    
-//                 }
-
-//             } else {  
-//                 const unsigned diff = move.to_step - source_step;
-//                 unsigned idx = window_size + diff; 
-
-//                 if (idx < window_bound && is_compatible(source, move.to_proc)) {
-//                     affinity_table_source[move.to_proc][idx] -= penalty;                         
-//                 }
-//                 for (; idx < window_bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(source)) { 
-//                         affinity_table_source[p][idx] += penalty;
-//                     }                        
-//                 }                     
-//             }  
-
-//             if (move.to_proc != move.from_proc) {                
-//                 const auto from_proc_source_comm_cost = instance->communicationCosts(source_proc, move.from_proc);
-//                 const auto to_proc_source_comm_cost = instance->communicationCosts(source_proc, move.to_proc);
-
-//                 const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
-
-//                 unsigned idx = start_idx(source_step, start_step);
-//                 const unsigned window_bound = end_idx(source_step, end_step);
-//                 for (; idx < window_bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(source)) { 
-//                         const cost_t x = change_comm_cost(instance->communicationCosts(p, move.to_proc), to_proc_source_comm_cost, comm_gain); 
-//                         const cost_t y = change_comm_cost(instance->communicationCosts(p, move.from_proc), from_proc_source_comm_cost, comm_gain);
-//                         affinity_table_source[p][idx] += x - y;  
-//                     }
-//                 }
-//             }
-//         } 
-//     }
-
-//     inline unsigned start_idx(const unsigned node_step, const unsigned start_step) { return (node_step < window_size + start_step) ? window_size - (node_step - start_step) : 0; }
-//     inline unsigned end_idx(const unsigned node_step, const unsigned end_step) { return (node_step + window_size <= end_step) ? window_range : window_range - (node_step + window_size - end_step); }
-
-//     inline cost_t change_comm_cost(const v_commw_t<Graph_t> &p_target_comm_cost, const v_commw_t<Graph_t> &node_target_comm_cost, const cost_t &comm_gain) { return p_target_comm_cost > node_target_comm_cost ? (p_target_comm_cost - node_target_comm_cost) * comm_gain : (node_target_comm_cost - p_target_comm_cost) * comm_gain * -1.0;}
-
-//     template<typename affinity_table_t>
-//     void compute_comm_affinity(VertexType node, affinity_table_t& affinity_table_node, const cost_t& penalty, const cost_t& reward, const unsigned start_step, const unsigned end_step) {
-//         const unsigned node_step = active_schedule->assigned_superstep(node);
-//         const unsigned node_proc = active_schedule->assigned_processor(node);
-//         const unsigned window_bound = end_idx(node_step, end_step);
-//         const unsigned node_start_idx = start_idx(node_step, start_step);
-
-//         for (const auto &target : instance->getComputationalDag().children(node)) {
-//             const unsigned target_step = active_schedule->assigned_superstep(target);
-//             const unsigned target_proc = active_schedule->assigned_processor(target); 
-
-//             if (target_step < node_step + (target_proc != node_proc)) {
-//                 const unsigned diff = node_step - target_step; 
-//                 const unsigned bound = window_size > diff ? window_size - diff : 0; 
-//                 unsigned idx = node_start_idx;
-
-//                 for (; idx < bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
-//                         affinity_table_node[p][idx] -= reward;
-//                     } 
-//                 }
-
-//                 if (window_size >= diff && is_compatible(node, target_proc)) {
-//                     affinity_table_node[target_proc][idx] -= reward;    
-//                 }  
-
-//             } else {  
-//                 const unsigned diff = target_step - node_step;
-//                 unsigned idx = window_size + diff;
-
-//                 if (idx < window_bound && is_compatible(node, target_proc)) {
-//                     affinity_table_node[target_proc][idx] -= penalty; 
-//                 }
-
-//                 for (; idx < window_bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                      
-//                         affinity_table_node[p][idx] += penalty;
-//                     }                        
-//                 }                     
-//             }    
-
-//             const cost_t comm_gain = graph->vertex_comm_weight(node) * comm_multiplier;
-//             const auto node_target_comm_cost = instance->communicationCosts(node_proc, target_proc);
-
-//             for (const unsigned p : proc_range->compatible_processors_vertex(node)) {
-//                 if (p != target_proc) {
-//                     const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, target_proc), node_target_comm_cost, comm_gain);
-//                     for (unsigned idx = node_start_idx; idx < window_bound; idx++) {
-//                         affinity_table_node[p][idx] += comm_cost;
-//                     }
-//                 } else {
-//                     for (unsigned idx = node_start_idx; idx < window_bound; idx++) {
-//                         if(idx == 0) continue;
-//                         affinity_table_node[p][idx] += comm_gain;
-//                     } 
-//                 }
-//             }
-
-//         } // traget
-
-//         for (const auto &source : instance->getComputationalDag().parents(node)) {
-//             const unsigned source_step = active_schedule->assigned_superstep(source);
-//             const unsigned source_proc = active_schedule->assigned_processor(source);  
-
-//             if (source_step < node_step + (source_proc == node_proc)) {
-//                 const unsigned diff = node_step - source_step;                
-//                 const unsigned bound = window_size >= diff ? window_size - diff + 1: 0;  
-//                 unsigned idx = node_start_idx;
-
-//                 for (; idx < bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
-//                         affinity_table_node[p][idx] += penalty; 
-//                     }                                                
-//                 }
-
-//                 if (idx - 1 < bound && is_compatible(node, source_proc)) {
-//                     affinity_table_node[source_proc][idx - 1] -= penalty;    
-//                 }
-
-//             } else {
-//                 const unsigned diff = source_step - node_step;
-//                 unsigned idx = std::min(window_size + diff, window_bound);
-
-//                 if (idx < window_bound && is_compatible(node, source_proc)) {
-//                     affinity_table_node[source_proc][idx] -= reward;  
-//                 }
-                
-//                 idx++;
-
-//                 for (; idx < window_bound; idx++) {
-//                     for (const unsigned p : proc_range->compatible_processors_vertex(node)) {                        
-//                         affinity_table_node[p][idx] -= reward;
-//                     }                        
-//                 } 
-//             }
-
-//             const cost_t comm_gain = graph->vertex_comm_weight(source) * comm_multiplier;
-//             const auto source_node_comm_cost = instance->communicationCosts(source_proc, node_proc);
-
-//             for (const unsigned p : proc_range->compatible_processors_vertex(node)) {   
-//                 const cost_t comm_cost = change_comm_cost(instance->communicationCosts(p, source_proc), source_node_comm_cost, comm_gain);
-//                 for (unsigned idx = node_start_idx; idx < window_bound; idx++) {
-//                     affinity_table_node[p][idx] += comm_cost;
-//                 }
-//             }
-//         } // source
-//     }
-// };
-
-// } // namespace osp
-
diff --git a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp
index 97bd35a7..3657ed52 100644
--- a/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp
+++ b/include/osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver.hpp
@@ -152,7 +152,7 @@ class kl_improver : public ImprovementScheduler<Graph_t> {
     const Graph_t *graph;
     const BspInstance<Graph_t> *instance;
 
-    compatible_processor_range<Graph_t> proc_range;
+    CompatibleProcessorRange<Graph_t> proc_range;
 
     kl_parameter parameters;
     std::mt19937 gen;
diff --git a/include/osp/dag_divider/AbstractWavefrontScheduler.hpp b/include/osp/dag_divider/AbstractWavefrontScheduler.hpp
index 556e82bc..69a3c80c 100644
--- a/include/osp/dag_divider/AbstractWavefrontScheduler.hpp
+++ b/include/osp/dag_divider/AbstractWavefrontScheduler.hpp
@@ -21,10 +21,10 @@ limitations under the License.
 #include "osp/graph_algorithms/computational_dag_util.hpp"
 #include "osp/graph_algorithms/subgraph_algorithms.hpp"
 #include "osp/graph_implementations/boost_graphs/boost_graph.hpp"
-#include <numeric>
 #include <algorithm>
-#include <iostream>
 #include <cassert>
+#include <iostream>
+#include <numeric>
 
 namespace osp {
 
@@ -34,7 +34,7 @@ namespace osp {
  */
 template<typename Graph_t, typename constr_graph_t>
 class AbstractWavefrontScheduler : public Scheduler<Graph_t> {
-protected:
+  protected:
     IDagDivider<Graph_t> *divider;
     Scheduler<constr_graph_t> *scheduler;
     static constexpr bool enable_debug_prints = true;
@@ -46,17 +46,17 @@ class AbstractWavefrontScheduler : public Scheduler<Graph_t> {
      */
     bool distributeProcessors(
         unsigned total_processors_of_type,
-        const std::vector<double>& work_weights,
-        std::vector<unsigned>& allocation) const {
-        
+        const std::vector<double> &work_weights,
+        std::vector<unsigned> &allocation) const {
+
         allocation.assign(work_weights.size(), 0);
         double total_work = std::accumulate(work_weights.begin(), work_weights.end(), 0.0);
         if (total_work <= 1e-9 || total_processors_of_type == 0) {
             return false;
         }
-        
+
         std::vector<size_t> active_indices;
-        for(size_t i = 0; i < work_weights.size(); ++i) {
+        for (size_t i = 0; i < work_weights.size(); ++i) {
             if (work_weights[i] > 1e-9) {
                 active_indices.push_back(i);
             }
@@ -68,7 +68,7 @@ class AbstractWavefrontScheduler : public Scheduler<Graph_t> {
 
         size_t num_active_components = active_indices.size();
         unsigned remaining_procs = total_processors_of_type;
-        
+
         // --- Stage 1: Guarantee at least one processor if possible (anti-starvation) ---
         if (total_processors_of_type >= num_active_components) {
             // Abundance case: Give one processor to each active component first.
@@ -79,11 +79,11 @@ class AbstractWavefrontScheduler : public Scheduler<Graph_t> {
         } else {
             // Scarcity case: Not enough processors for each active component.
             std::vector<std::pair<double, size_t>> sorted_work;
-            for(size_t idx : active_indices) {
+            for (size_t idx : active_indices) {
                 sorted_work.push_back({work_weights[idx], idx});
             }
             std::sort(sorted_work.rbegin(), sorted_work.rend());
-            for(unsigned i = 0; i < remaining_procs; ++i) {
+            for (unsigned i = 0; i < remaining_procs; ++i) {
                 allocation[sorted_work[i].second]++;
             }
             return true; // Scarcity case was hit.
@@ -93,10 +93,10 @@ class AbstractWavefrontScheduler : public Scheduler<Graph_t> {
         if (remaining_procs > 0) {
             std::vector<double> adjusted_work_weights;
             double adjusted_total_work = 0;
-            
+
             double work_per_proc = total_work / static_cast<double>(total_processors_of_type);
 
-            for(size_t idx : active_indices) {
+            for (size_t idx : active_indices) {
                 double adjusted_work = std::max(0.0, work_weights[idx] - work_per_proc);
                 adjusted_work_weights.push_back(adjusted_work);
                 adjusted_total_work += adjusted_work;
@@ -123,14 +123,13 @@ class AbstractWavefrontScheduler : public Scheduler<Graph_t> {
                     }
                 }
             }
-        }        
+        }
         return false; // Scarcity case was not hit.
     }
 
-
     BspArchitecture<constr_graph_t> createSubArchitecture(
         const BspArchitecture<Graph_t> &original_arch,
-        const std::vector<unsigned>& sub_dag_proc_types) const {
+        const std::vector<unsigned> &sub_dag_proc_types) const {
 
         // The calculation is now inside the assert, so it only happens in debug builds.
         assert(std::accumulate(sub_dag_proc_types.begin(), sub_dag_proc_types.end(), 0u) > 0 && "Attempted to create a sub-architecture with zero processors.");
@@ -142,33 +141,34 @@ class AbstractWavefrontScheduler : public Scheduler<Graph_t> {
             sub_dag_processor_memory[original_arch.processorType(i)] =
                 std::min(original_arch.memoryBound(i), sub_dag_processor_memory[original_arch.processorType(i)]);
         }
-        sub_architecture.set_processors_consequ_types(sub_dag_proc_types, sub_dag_processor_memory);
+        sub_architecture.SetProcessorsConsequTypes(sub_dag_proc_types, sub_dag_processor_memory);
         return sub_architecture;
     }
 
-    bool validateWorkDistribution(const std::vector<constr_graph_t>& sub_dags, const BspInstance<Graph_t>& instance) const {
-        const auto& original_arch = instance.getArchitecture();
-        for (const auto& rep_sub_dag : sub_dags) {
+    bool validateWorkDistribution(const std::vector<constr_graph_t> &sub_dags, const BspInstance<Graph_t> &instance) const {
+        const auto &original_arch = instance.getArchitecture();
+        for (const auto &rep_sub_dag : sub_dags) {
             const double total_rep_work = sumOfVerticesWorkWeights(rep_sub_dag);
-            
+
             double sum_of_compatible_works_for_rep = 0.0;
             for (unsigned type_idx = 0; type_idx < original_arch.getNumberOfProcessorTypes(); ++type_idx) {
                 sum_of_compatible_works_for_rep += sumOfCompatibleWorkWeights(rep_sub_dag, instance, type_idx);
             }
 
             if (sum_of_compatible_works_for_rep > total_rep_work + 1e-9) {
-                if constexpr (enable_debug_prints) std::cerr << "ERROR: Sum of compatible work (" << sum_of_compatible_works_for_rep 
-                                              << ") exceeds total work (" << total_rep_work 
-                                              << ") for a sub-dag. Aborting." << std::endl;
+                if constexpr (enable_debug_prints)
+                    std::cerr << "ERROR: Sum of compatible work (" << sum_of_compatible_works_for_rep
+                              << ") exceeds total work (" << total_rep_work
+                              << ") for a sub-dag. Aborting." << std::endl;
                 return false;
             }
         }
         return true;
     }
 
-public:
+  public:
     AbstractWavefrontScheduler(IDagDivider<Graph_t> &div, Scheduler<constr_graph_t> &sched)
         : divider(&div), scheduler(&sched) {}
 };
 
-}
+} // namespace osp
diff --git a/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp b/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp
index 5ba326d9..d1d61016 100644
--- a/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp
+++ b/include/osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp
@@ -16,22 +16,22 @@ limitations under the License.
 @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
 */
 
-#include <iostream>
-#include <chrono>
-#include <ctime>
-#include <iomanip>
-#include "OrbitGraphProcessor.hpp"
+#include "EftSubgraphScheduler.hpp"
 #include "HashComputer.hpp"
 #include "MerkleHashComputer.hpp"
-#include "EftSubgraphScheduler.hpp"
+#include "OrbitGraphProcessor.hpp"
 #include "TrimmedGroupScheduler.hpp"
 #include "osp/auxiliary/io/DotFileWriter.hpp"
 #include "osp/bsp/scheduler/Scheduler.hpp"
 #include "osp/graph_algorithms/subgraph_algorithms.hpp"
+#include <chrono>
+#include <ctime>
+#include <iomanip>
+#include <iostream>
 
 namespace osp {
 
-/** 
+/**
  * @brief A scheduler that leverages isomorphic subgraphs to partition a DAG.
  *
  * @class IsomorphicSubgraphScheduler
@@ -58,12 +58,11 @@ class IsomorphicSubgraphScheduler {
     static_assert(std::is_same_v<vertex_idx_t<Graph_t>, vertex_idx_t<Constr_Graph_t>>,
                   "Graph_t and Constr_Graph_t must have the same vertex_idx types");
 
-    private:
-
-    static constexpr bool verbose = false;    
-    const HashComputer<vertex_idx_t<Graph_t>>* hash_computer_;
+  private:
+    static constexpr bool verbose = false;
+    const HashComputer<vertex_idx_t<Graph_t>> *hash_computer_;
     size_t symmetry_ = 4;
-    Scheduler<Constr_Graph_t> * bsp_scheduler_;
+    Scheduler<Constr_Graph_t> *bsp_scheduler_;
     bool use_max_group_size_ = false;
     unsigned max_group_size_ = 0;
     bool plot_dot_graphs_ = false;
@@ -76,22 +75,21 @@ class IsomorphicSubgraphScheduler {
     bool use_max_bsp = false;
     bool use_adaptive_symmetry_threshold = true;
 
-    public:
-
-    explicit IsomorphicSubgraphScheduler(Scheduler<Constr_Graph_t> & bsp_scheduler) 
+  public:
+    explicit IsomorphicSubgraphScheduler(Scheduler<Constr_Graph_t> &bsp_scheduler)
         : hash_computer_(nullptr), bsp_scheduler_(&bsp_scheduler), plot_dot_graphs_(false) {}
 
-    IsomorphicSubgraphScheduler(Scheduler<Constr_Graph_t> & bsp_scheduler, const HashComputer<vertex_idx_t<Graph_t>>& hash_computer) 
+    IsomorphicSubgraphScheduler(Scheduler<Constr_Graph_t> &bsp_scheduler, const HashComputer<vertex_idx_t<Graph_t>> &hash_computer)
         : hash_computer_(&hash_computer), bsp_scheduler_(&bsp_scheduler), plot_dot_graphs_(false) {}
 
     virtual ~IsomorphicSubgraphScheduler() {}
 
-    void setMergeDifferentTypes(bool flag) {merge_different_node_types = flag;}
-    void setWorkThreshold(v_workw_t<Constr_Graph_t> work_threshold) {work_threshold_ = work_threshold;}
-    void setCriticalPathThreshold(v_workw_t<Constr_Graph_t> critical_path_threshold) {critical_path_threshold_ = critical_path_threshold;}
-    void setOrbitLockRatio(double orbit_lock_ratio) {orbit_lock_ratio_ = orbit_lock_ratio;}
-    void setNaturalBreaksCountPercentage(double natural_breaks_count_percentage) {natural_breaks_count_percentage_ = natural_breaks_count_percentage;}
-    void setAllowTrimmedScheduler(bool flag) {allow_use_trimmed_scheduler = flag;}
+    void setMergeDifferentTypes(bool flag) { merge_different_node_types = flag; }
+    void setWorkThreshold(v_workw_t<Constr_Graph_t> work_threshold) { work_threshold_ = work_threshold; }
+    void setCriticalPathThreshold(v_workw_t<Constr_Graph_t> critical_path_threshold) { critical_path_threshold_ = critical_path_threshold; }
+    void setOrbitLockRatio(double orbit_lock_ratio) { orbit_lock_ratio_ = orbit_lock_ratio; }
+    void setNaturalBreaksCountPercentage(double natural_breaks_count_percentage) { natural_breaks_count_percentage_ = natural_breaks_count_percentage; }
+    void setAllowTrimmedScheduler(bool flag) { allow_use_trimmed_scheduler = flag; }
     void set_plot_dot_graphs(bool plot) { plot_dot_graphs_ = plot; }
     void disable_use_max_group_size() { use_max_group_size_ = false; }
     void setUseMaxBsp(bool flag) { use_max_bsp = flag; }
@@ -100,12 +98,12 @@ class IsomorphicSubgraphScheduler {
         max_group_size_ = max_group_size;
     }
     void setEnableAdaptiveSymmetryThreshold() { use_adaptive_symmetry_threshold = true; }
-    void setUseStaticSymmetryLevel(size_t static_symmetry_level) { 
-        use_adaptive_symmetry_threshold = false; 
-        symmetry_ = static_symmetry_level; 
+    void setUseStaticSymmetryLevel(size_t static_symmetry_level) {
+        use_adaptive_symmetry_threshold = false;
+        symmetry_ = static_symmetry_level;
     }
 
-    std::vector<vertex_idx_t<Graph_t>> compute_partition(const BspInstance<Graph_t>& instance) {
+    std::vector<vertex_idx_t<Graph_t>> compute_partition(const BspInstance<Graph_t> &instance) {
         OrbitGraphProcessor<Graph_t, Constr_Graph_t> orbit_processor;
         orbit_processor.set_work_threshold(work_threshold_);
         orbit_processor.setMergeDifferentNodeTypes(merge_different_node_types);
@@ -116,7 +114,7 @@ class IsomorphicSubgraphScheduler {
             orbit_processor.setUseStaticSymmetryLevel(symmetry_);
         }
 
-        std::unique_ptr<HashComputer<vertex_idx_t<Graph_t>>> local_hasher;      
+        std::unique_ptr<HashComputer<vertex_idx_t<Graph_t>>> local_hasher;
         if (!hash_computer_) {
             local_hasher = std::make_unique<MerkleHashComputer<Graph_t, bwd_merkle_node_hash_func<Graph_t>, true>>(instance.getComputationalDag(), instance.getComputationalDag());
             hash_computer_ = local_hasher.get();
@@ -125,7 +123,7 @@ class IsomorphicSubgraphScheduler {
         orbit_processor.discover_isomorphic_groups(instance.getComputationalDag(), *hash_computer_);
 
         auto isomorphic_groups = orbit_processor.get_final_groups();
-        
+
         std::vector<bool> was_trimmed(isomorphic_groups.size(), false);
         trim_subgraph_groups(isomorphic_groups, instance, was_trimmed); // Apply trimming and record which groups were affected
 
@@ -157,8 +155,7 @@ class IsomorphicSubgraphScheduler {
         return partition;
     }
 
-    protected:
-
+  protected:
     template<typename G_t, typename C_G_t>
     struct subgraph_scheduler_input {
         BspInstance<C_G_t> instance;
@@ -167,14 +164,14 @@ class IsomorphicSubgraphScheduler {
         std::vector<std::vector<v_workw_t<G_t>>> required_proc_types;
     };
 
-    void trim_subgraph_groups(std::vector<typename OrbitGraphProcessor<Graph_t, Constr_Graph_t>::Group>& isomorphic_groups,
-                              const BspInstance<Graph_t>& instance,
-                              std::vector<bool>& was_trimmed) {
+    void trim_subgraph_groups(std::vector<typename OrbitGraphProcessor<Graph_t, Constr_Graph_t>::Group> &isomorphic_groups,
+                              const BspInstance<Graph_t> &instance,
+                              std::vector<bool> &was_trimmed) {
         if constexpr (verbose) {
             std::cout << "\n--- Trimming Isomorphic Subgraph Groups ---" << std::endl;
         }
         for (size_t group_idx = 0; group_idx < isomorphic_groups.size(); ++group_idx) {
-            auto& group = isomorphic_groups[group_idx];
+            auto &group = isomorphic_groups[group_idx];
             const unsigned group_size = static_cast<unsigned>(group.size());
             if (group_size <= 1)
                 continue;
@@ -194,24 +191,24 @@ class IsomorphicSubgraphScheduler {
                 if constexpr (has_typed_vertices_v<Graph_t>) {
                     if (!group.subgraphs.empty() && !group.subgraphs[0].empty()) {
                         common_node_type = instance.getComputationalDag().vertex_type(group.subgraphs[0][0]);
-                        const auto& rep_subgraph = group.subgraphs[0];   
-                        for (const auto& vertex : rep_subgraph) {
+                        const auto &rep_subgraph = group.subgraphs[0];
+                        for (const auto &vertex : rep_subgraph) {
                             if (instance.getComputationalDag().vertex_type(vertex) != common_node_type) {
                                 is_single_type_group = false;
                                 break;
                             }
                         }
                     } else {
-                        is_single_type_group = false; 
+                        is_single_type_group = false;
                     }
                 } else {
-                    is_single_type_group = false; 
+                    is_single_type_group = false;
                 }
 
                 if (is_single_type_group) {
                     // Dynamically determine min_proc_type_count based on compatible processors for this type
                     unsigned min_compatible_processors = std::numeric_limits<unsigned>::max();
-                    const auto& proc_type_counts = instance.getArchitecture().getProcessorTypeCount();
+                    const auto &proc_type_counts = instance.getArchitecture().getProcessorTypeCount();
 
                     bool found_compatible_processor = false;
                     for (unsigned proc_type_idx = 0; proc_type_idx < proc_type_counts.size(); ++proc_type_idx) {
@@ -222,13 +219,13 @@ class IsomorphicSubgraphScheduler {
                     }
                     if (found_compatible_processor) {
                         if constexpr (verbose) {
-                            std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type 
+                            std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type
                                       << "). Min compatible processors: " << min_compatible_processors << "." << std::endl;
                         }
                         effective_min_proc_type_count = min_compatible_processors;
                     } else {
                         if constexpr (verbose) {
-                            std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type 
+                            std::cout << "Group " << group_idx << " (size " << group_size << "): Single node type (" << common_node_type
                                       << ") but no compatible processors found. Disabling trimming." << std::endl;
                         }
                         // If no compatible processors found for this type, effectively disable trimming for this group.
@@ -236,7 +233,11 @@ class IsomorphicSubgraphScheduler {
                     }
                 } else {
                     // Fallback to a default min_proc_type_count if not a single-type group or no typed vertices.
-                    effective_min_proc_type_count = instance.getArchitecture().getMinProcessorTypeCount();
+                    const auto &type_count = instance.getArchitecture().getProcessorTypeCount();
+                    // With no processor types, min_element returns end(), which must not be dereferenced; keep 0.
+                    effective_min_proc_type_count = 0;
+                    if (!type_count.empty())
+                        effective_min_proc_type_count = *std::min_element(type_count.begin(), type_count.end());
                     if constexpr (verbose) {
                         std::cout << "Group " << group_idx << " (size " << group_size << "): Multi-type or untyped group. Using default min_proc_type_count: " << effective_min_proc_type_count << "." << std::endl;
                     }
@@ -257,13 +258,13 @@ class IsomorphicSubgraphScheduler {
 
             if (gcd < group_size) {
                 if constexpr (verbose) {
-                    std::cout << "  -> Trimming group " << group_idx << ". GCD(" << group_size << ", " << effective_min_proc_type_count 
+                    std::cout << "  -> Trimming group " << group_idx << ". GCD(" << group_size << ", " << effective_min_proc_type_count
                               << ") = " << gcd << ". Merging " << group_size / gcd << " subgraphs at a time." << std::endl;
                 }
 
                 if (allow_use_trimmed_scheduler)
                     gcd = 1;
-                
+
                 was_trimmed[group_idx] = true;
                 const unsigned merge_size = group_size / gcd;
                 std::vector<std::vector<vertex_idx_t<Graph_t>>> new_subgraphs;
@@ -279,7 +280,7 @@ class IsomorphicSubgraphScheduler {
                     }
 
                     for (unsigned k = 0; k < merge_size; ++k) {
-                        const auto& sg_to_merge_vertices = group.subgraphs[original_sg_cursor];
+                        const auto &sg_to_merge_vertices = group.subgraphs[original_sg_cursor];
                         original_sg_cursor++;
                         merged_sg_vertices.insert(merged_sg_vertices.end(), sg_to_merge_vertices.begin(), sg_to_merge_vertices.end());
                     }
@@ -292,14 +293,14 @@ class IsomorphicSubgraphScheduler {
                 }
                 was_trimmed[group_idx] = false;
             }
-       }
+        }
     }
 
     subgraph_scheduler_input<Graph_t, Constr_Graph_t> prepare_subgraph_scheduling_input(
-        const BspInstance<Graph_t>& original_instance,
-        const std::vector<typename OrbitGraphProcessor<Graph_t, Constr_Graph_t>::Group>& isomorphic_groups,
-        const std::vector<bool>& was_trimmed) {
-        
+        const BspInstance<Graph_t> &original_instance,
+        const std::vector<typename OrbitGraphProcessor<Graph_t, Constr_Graph_t>::Group> &isomorphic_groups,
+        const std::vector<bool> &was_trimmed) {
+
         subgraph_scheduler_input<Graph_t, Constr_Graph_t> result;
         result.instance.setArchitecture(original_instance.getArchitecture());
         const unsigned num_proc_types = original_instance.getArchitecture().getNumberOfProcessorTypes();
@@ -332,35 +333,35 @@ class IsomorphicSubgraphScheduler {
             ++coarse_node_idx;
         }
         coarser_util::construct_coarse_dag(original_instance.getComputationalDag(), result.instance.getComputationalDag(),
-                                        contraction_map);
+                                           contraction_map);
 
         if constexpr (verbose) {
             std::cout << "\n--- Preparing Subgraph Scheduling Input ---\n";
             std::cout << "Found " << isomorphic_groups.size() << " isomorphic groups to schedule as coarse nodes.\n";
             for (size_t j = 0; j < isomorphic_groups.size(); ++j) {
                 std::cout << "  - Coarse Node " << j << " (from " << isomorphic_groups[j].subgraphs.size()
-                        << " isomorphic subgraphs):\n";
+                          << " isomorphic subgraphs):\n";
                 std::cout << "    - Multiplicity for scheduling: " << result.multiplicities[j] << "\n";
                 std::cout << "    - Total Work (in coarse graph): " << result.instance.getComputationalDag().vertex_work_weight(j) << "\n";
                 std::cout << "    - Required Processor Types: ";
                 for (unsigned k = 0; k < num_proc_types; ++k) {
                     std::cout << result.required_proc_types[j][k] << " ";
                 }
-                std::cout << "\n";  
+                std::cout << "\n";
                 std::cout << "    - Max number of processors: " << result.max_num_processors[j] << "\n";
             }
         }
         return result;
     }
 
-    void schedule_isomorphic_group(const BspInstance<Graph_t>& instance, 
-                                   const std::vector<typename OrbitGraphProcessor<Graph_t, Constr_Graph_t>::Group>& isomorphic_groups, 
-                                   const SubgraphSchedule & sub_sched, 
-                                   std::vector<vertex_idx_t<Graph_t>> & partition) {
+    void schedule_isomorphic_group(const BspInstance<Graph_t> &instance,
+                                   const std::vector<typename OrbitGraphProcessor<Graph_t, Constr_Graph_t>::Group> &isomorphic_groups,
+                                   const SubgraphSchedule &sub_sched,
+                                   std::vector<vertex_idx_t<Graph_t>> &partition) {
         vertex_idx_t<Graph_t> current_partition_idx = 0;
 
         for (size_t group_idx = 0; group_idx < isomorphic_groups.size(); ++group_idx) {
-            const auto& group = isomorphic_groups[group_idx];
+            const auto &group = isomorphic_groups[group_idx];
             if (group.subgraphs.empty()) {
                 continue;
             }
@@ -373,47 +374,47 @@ class IsomorphicSubgraphScheduler {
             auto rep_global_to_local_map = create_induced_subgraph_map(instance.getComputationalDag(), representative_instance.getComputationalDag(), rep_subgraph_vertices_sorted);
 
             representative_instance.setArchitecture(instance.getArchitecture());
-            const auto& procs_for_group = sub_sched.node_assigned_worker_per_type[group_idx];
+            const auto &procs_for_group = sub_sched.node_assigned_worker_per_type[group_idx];
             std::vector<v_memw_t<Constr_Graph_t>> mem_weights(procs_for_group.size(), 0);
             for (unsigned proc_type = 0; proc_type < procs_for_group.size(); ++proc_type) {
                 mem_weights[proc_type] = static_cast<v_memw_t<Constr_Graph_t>>(instance.getArchitecture().maxMemoryBoundProcType(proc_type));
             }
-            representative_instance.getArchitecture().set_processors_consequ_types(procs_for_group, mem_weights);
+            representative_instance.getArchitecture().SetProcessorsConsequTypes(procs_for_group, mem_weights);
             representative_instance.setNodeProcessorCompatibility(instance.getProcessorCompatibilityMatrix());
 
             // --- Decide which scheduler to use ---
             unsigned min_non_zero_procs = std::numeric_limits<unsigned>::max();
-            for (const auto& proc_count : procs_for_group) {
+            for (const auto &proc_count : procs_for_group) {
                 if (proc_count > 0) {
                     min_non_zero_procs = std::min(min_non_zero_procs, proc_count);
                 }
             }
 
-
             bool use_trimmed_scheduler = sub_sched.was_trimmed[group_idx] && min_non_zero_procs > 1 && allow_use_trimmed_scheduler;
- 
-            Scheduler<Constr_Graph_t>* scheduler_for_group_ptr;
+
+            Scheduler<Constr_Graph_t> *scheduler_for_group_ptr;
             std::unique_ptr<Scheduler<Constr_Graph_t>> trimmed_scheduler_owner;
             if (use_trimmed_scheduler) {
-                if constexpr (verbose) std::cout << "Using TrimmedGroupScheduler for group " << group_idx << std::endl;
+                if constexpr (verbose)
+                    std::cout << "Using TrimmedGroupScheduler for group " << group_idx << std::endl;
                 trimmed_scheduler_owner = std::make_unique<TrimmedGroupScheduler<Constr_Graph_t>>(*bsp_scheduler_, min_non_zero_procs);
                 scheduler_for_group_ptr = trimmed_scheduler_owner.get();
             } else {
-                if constexpr (verbose) std::cout << "Using standard BSP scheduler for group " << group_idx << std::endl;
+                if constexpr (verbose)
+                    std::cout << "Using standard BSP scheduler for group " << group_idx << std::endl;
                 scheduler_for_group_ptr = bsp_scheduler_;
             }
 
-
             // --- Schedule the representative to get the pattern ---
             BspSchedule<Constr_Graph_t> bsp_schedule(representative_instance);
 
             if constexpr (verbose) {
                 std::cout << "--- Scheduling representative for group " << group_idx << " ---" << std::endl;
                 std::cout << "  Number of subgraphs in group: " << group.subgraphs.size() << std::endl;
-                const auto& rep_dag = representative_instance.getComputationalDag();
+                const auto &rep_dag = representative_instance.getComputationalDag();
                 std::cout << "  Representative subgraph size: " << rep_dag.num_vertices() << " vertices" << std::endl;
                 std::vector<unsigned> node_type_counts(rep_dag.num_vertex_types(), 0);
-                for (const auto& v : rep_dag.vertices()) {
+                for (const auto &v : rep_dag.vertices()) {
                     node_type_counts[rep_dag.vertex_type(v)]++;
                 }
                 std::cout << "    Node type counts: ";
@@ -424,45 +425,43 @@ class IsomorphicSubgraphScheduler {
                 }
                 std::cout << std::endl;
 
-                const auto& sub_arch = representative_instance.getArchitecture();
+                const auto &sub_arch = representative_instance.getArchitecture();
                 std::cout << "  Sub-architecture for scheduling:" << std::endl;
                 std::cout << "    Processors: " << sub_arch.numberOfProcessors() << std::endl;
                 std::cout << "    Processor types counts: ";
-                const auto& type_counts = sub_arch.getProcessorTypeCount();
+                const auto &type_counts = sub_arch.getProcessorTypeCount();
                 for (size_t type_idx = 0; type_idx < type_counts.size(); ++type_idx) {
                     std::cout << "T" << type_idx << ":" << type_counts[type_idx] << " ";
                 }
                 std::cout << std::endl;
                 std::cout << "    Sync cost: " << sub_arch.synchronisationCosts() << ", Comm cost: " << sub_arch.communicationCosts() << std::endl;
                 std::cout << "    Sub-problem compatibility matrix:" << std::endl;
-                const auto & sub_comp_matrix = representative_instance.getNodeNodeCompatabilityMatrix();
-                for(unsigned i = 0; i < sub_comp_matrix.size(); ++i) {
+                const auto &sub_comp_matrix = representative_instance.getNodeNodeCompatabilityMatrix();
+                for (unsigned i = 0; i < sub_comp_matrix.size(); ++i) {
                     std::cout << "      Node Type " << i << ": [ ";
                     for (unsigned j = 0; j < sub_comp_matrix[i].size(); ++j) {
                         std::cout << (sub_comp_matrix[i][j] ? "1" : "0") << " ";
                     }
                     std::cout << "]" << std::endl;
                 }
-
             }
-            
+
             scheduler_for_group_ptr->computeSchedule(bsp_schedule);
 
             if constexpr (verbose) {
-                std::cout << "  Schedule satisfies precedence constraints: ";  
+                std::cout << "  Schedule satisfies precedence constraints: ";
                 std::cout << bsp_schedule.satisfiesPrecedenceConstraints() << std::endl;
                 std::cout << "  Schedule satisfies node type constraints: ";
                 std::cout << bsp_schedule.satisfiesNodeTypeConstraints() << std::endl;
             }
-            
 
             if (plot_dot_graphs_) {
-                const auto& rep_dag = bsp_schedule.getInstance().getComputationalDag();
+                const auto &rep_dag = bsp_schedule.getInstance().getComputationalDag();
                 std::vector<unsigned> colors(rep_dag.num_vertices());
                 std::map<std::pair<unsigned, unsigned>, unsigned> proc_ss_to_color;
                 unsigned next_color = 0;
 
-                for (const auto& v : rep_dag.vertices()) {
+                for (const auto &v : rep_dag.vertices()) {
                     const auto assignment = std::make_pair(bsp_schedule.assignedProcessor(v), bsp_schedule.assignedSuperstep(v));
                     if (proc_ss_to_color.find(assignment) == proc_ss_to_color.end()) {
                         proc_ss_to_color[assignment] = next_color++;
@@ -476,12 +475,10 @@ class IsomorphicSubgraphScheduler {
                 ss << std::put_time(std::localtime(&in_time_t), "%Y%m%d_%H%M%S");
                 std::string timestamp = ss.str() + "_";
 
-
                 DotFileWriter writer;
                 writer.write_colored_graph(timestamp + "iso_group_rep_" + std::to_string(group_idx) + ".dot", rep_dag, colors);
             }
 
-
             const bool max_bsp = use_max_bsp && (representative_instance.getComputationalDag().num_edges() == 0) && (representative_instance.getComputationalDag().vertex_type(0) == 0);
 
             // Build data structures for applying the pattern ---
@@ -491,10 +488,9 @@ class IsomorphicSubgraphScheduler {
             for (vertex_idx_t<Graph_t> j = 0; j < static_cast<vertex_idx_t<Graph_t>>(rep_subgraph_vertices_sorted.size()); ++j) {
                 auto sp_pair = std::make_pair(bsp_schedule.assignedSuperstep(j), bsp_schedule.assignedProcessor(j));
 
-                if (max_bsp) 
+                if (max_bsp)
                     sp_pair = std::make_pair(j, 0);
 
-
                 if (sp_proc_to_relative_partition.find(sp_pair) == sp_proc_to_relative_partition.end()) {
                     sp_proc_to_relative_partition[sp_pair] = num_partitions_per_subgraph++;
                 }
@@ -516,12 +512,12 @@ class IsomorphicSubgraphScheduler {
                 } else { // For other subgraphs, build the isomorphic mapping
                     Constr_Graph_t current_subgraph_graph;
                     create_induced_subgraph(instance.getComputationalDag(), current_subgraph_graph, current_subgraph_vertices_sorted);
-                    
+
                     MerkleHashComputer<Constr_Graph_t> current_hasher(current_subgraph_graph);
 
-                    for(const auto& [hash, rep_orbit_nodes] : rep_hasher.get_orbits()) {
-                        const auto& current_orbit_nodes = current_hasher.get_orbit_from_hash(hash);
-                        for(size_t k = 0; k < rep_orbit_nodes.size(); ++k) {
+                    for (const auto &[hash, rep_orbit_nodes] : rep_hasher.get_orbits()) {
+                        const auto &current_orbit_nodes = current_hasher.get_orbit_from_hash(hash);
+                        for (size_t k = 0; k < rep_orbit_nodes.size(); ++k) {
                             // Map: current_subgraph_vertex -> representative_subgraph_local_idx
                             current_vertex_to_rep_local_idx[current_subgraph_vertices_sorted[current_orbit_nodes[k]]] = static_cast<vertex_idx_t<Constr_Graph_t>>(rep_orbit_nodes[k]);
                         }
@@ -529,11 +525,11 @@ class IsomorphicSubgraphScheduler {
                 }
 
                 // Apply the partition pattern
-                for (const auto& current_vertex : current_subgraph_vertices_sorted) {
+                for (const auto &current_vertex : current_subgraph_vertices_sorted) {
                     const auto rep_local_idx = current_vertex_to_rep_local_idx.at(current_vertex);
                     auto sp_pair = std::make_pair(bsp_schedule.assignedSuperstep(rep_local_idx), bsp_schedule.assignedProcessor(rep_local_idx));
 
-                    if (max_bsp) 
+                    if (max_bsp)
                         sp_pair = std::make_pair(rep_local_idx, 0);
 
                     partition[current_vertex] = current_partition_idx + sp_proc_to_relative_partition.at(sp_pair);
@@ -544,4 +540,4 @@ class IsomorphicSubgraphScheduler {
     }
 };
 
-}
\ No newline at end of file
+} // namespace osp
\ No newline at end of file
diff --git a/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp b/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp
index 0b125e71..88dcf1fa 100644
--- a/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp
+++ b/include/osp/dag_divider/isomorphism_divider/TrimmedGroupScheduler.hpp
@@ -19,8 +19,8 @@ limitations under the License.
 #pragma once
 
 #include "osp/bsp/scheduler/Scheduler.hpp"
-#include "osp/graph_algorithms/subgraph_algorithms.hpp"
 #include "osp/graph_algorithms/computational_dag_util.hpp"
+#include "osp/graph_algorithms/subgraph_algorithms.hpp"
 #include <iostream>
 #include <numeric>
 
@@ -35,7 +35,7 @@ namespace osp {
  * potentially disconnected, subgraph that resulted from merging smaller isomorphic subgraphs. It divides
  * the input graph into its weakly connected components and schedules them on proportionally allocated processors.
  */
-template <typename Constr_Graph_t>
+template<typename Constr_Graph_t>
 class TrimmedGroupScheduler : public Scheduler<Constr_Graph_t> {
 
     Scheduler<Constr_Graph_t> *sub_scheduler;
@@ -94,7 +94,7 @@ class TrimmedGroupScheduler : public Scheduler<Constr_Graph_t> {
         // Determine the processor allocation for a single sub-problem.
         // Calculate offsets for processor types within the main 'arch' (passed to TrimmedGroupScheduler)
         std::vector<unsigned> arch_proc_type_offsets(arch.getNumberOfProcessorTypes(), 0);
-        const auto& arch_proc_type_counts = arch.getProcessorTypeCount();
+        const auto &arch_proc_type_counts = arch.getProcessorTypeCount();
         for (unsigned type_idx = 1; type_idx < arch.getNumberOfProcessorTypes(); ++type_idx) {
             arch_proc_type_offsets[type_idx] = arch_proc_type_offsets[type_idx - 1] + arch_proc_type_counts[type_idx - 1];
         }
@@ -115,12 +115,12 @@ class TrimmedGroupScheduler : public Scheduler<Constr_Graph_t> {
         }
 
         // Create the sub-architecture for one sub-problem.
-        BspArchitecture<Constr_Graph_t> sub_arch(arch);  
-        sub_arch.set_processors_consequ_types(sub_proc_counts, mem_weights);
+        BspArchitecture<Constr_Graph_t> sub_arch(arch);
+        sub_arch.SetProcessorsConsequTypes(sub_proc_counts, mem_weights);
 
         // Calculate offsets for processor types within the 'sub_arch'
         std::vector<unsigned> sub_arch_proc_type_offsets(sub_arch.getNumberOfProcessorTypes(), 0);
-        const auto& sub_arch_proc_type_counts = sub_arch.getProcessorTypeCount();
+        const auto &sub_arch_proc_type_counts = sub_arch.getProcessorTypeCount();
         for (unsigned type_idx = 1; type_idx < sub_arch.getNumberOfProcessorTypes(); ++type_idx) {
             sub_arch_proc_type_offsets[type_idx] = sub_arch_proc_type_offsets[type_idx - 1] + sub_arch_proc_type_counts[type_idx - 1];
         }
@@ -135,8 +135,8 @@ class TrimmedGroupScheduler : public Scheduler<Constr_Graph_t> {
             std::sort(group_vertices.begin(), group_vertices.end());
 
             BspInstance<Constr_Graph_t> sub_instanc;
-            sub_instanc.setArchitecture(sub_arch); // Set the sub-architecture
-            sub_instanc.setNodeProcessorCompatibility(instance.getNodeProcessorCompatibilityMatrix()); // Inherit compatibility
+            sub_instanc.setArchitecture(sub_arch);                                                                          // Set the sub-architecture
+            sub_instanc.setNodeProcessorCompatibility(instance.getNodeProcessorCompatibilityMatrix());                      // Inherit compatibility
             auto global_to_local_map = create_induced_subgraph_map(dag, sub_instanc.getComputationalDag(), group_vertices); // Create induced subgraph
 
             // Create a schedule object for the sub-problem
@@ -144,10 +144,11 @@ class TrimmedGroupScheduler : public Scheduler<Constr_Graph_t> {
 
             // Call the sub-scheduler to compute the schedule for this group of components
             auto status = sub_scheduler->computeSchedule(sub_schedule);
-            if (status != RETURN_STATUS::OSP_SUCCESS && status != RETURN_STATUS::BEST_FOUND) return status;
+            if (status != RETURN_STATUS::OSP_SUCCESS && status != RETURN_STATUS::BEST_FOUND)
+                return status;
 
             // Map the sub-schedule back to the main schedule.
-            for (const auto& v_global : group_vertices) {
+            for (const auto &v_global : group_vertices) {
                 const auto v_local = global_to_local_map.at(v_global);
                 const unsigned sub_proc = sub_schedule.assignedProcessor(v_local);
                 const unsigned sub_superstep = sub_schedule.assignedSuperstep(v_local);
diff --git a/include/osp/graph_algorithms/computational_dag_construction_util.hpp b/include/osp/graph_algorithms/computational_dag_construction_util.hpp
index e85217e9..553996a6 100644
--- a/include/osp/graph_algorithms/computational_dag_construction_util.hpp
+++ b/include/osp/graph_algorithms/computational_dag_construction_util.hpp
@@ -34,7 +34,7 @@ namespace osp {
  * @tparam Graph_to The type of the target graph. Must satisfy `is_constructable_cdag_vertex`.
  * @param from The source graph.
  * @param to The target graph.
- */    
+ */
 template<typename Graph_from, typename Graph_to>
 void constructComputationalDag(const Graph_from &from, Graph_to &to) {
     static_assert(is_computational_dag_v<Graph_from>, "Graph_from must satisfy the computational_dag concept");
@@ -46,21 +46,21 @@ void constructComputationalDag(const Graph_from &from, Graph_to &to) {
     for (const auto &v_idx : from.vertices()) {
         if constexpr (has_typed_vertices_v<Graph_from> and has_typed_vertices_v<Graph_to>) {
             vertex_map.push_back(to.add_vertex(from.vertex_work_weight(v_idx), from.vertex_comm_weight(v_idx),
-                          from.vertex_mem_weight(v_idx), from.vertex_type(v_idx)));
+                                               from.vertex_mem_weight(v_idx), from.vertex_type(v_idx)));
         } else {
             vertex_map.push_back(to.add_vertex(from.vertex_work_weight(v_idx), from.vertex_comm_weight(v_idx),
-                          from.vertex_mem_weight(v_idx)));
+                                               from.vertex_mem_weight(v_idx)));
         }
     }
 
     if constexpr (has_edge_weights_v<Graph_from> and has_edge_weights_v<Graph_to>) {
         for (const auto &e : edges(from)) {
-            to.add_edge(vertex_map.at(source(e, from)), vertex_map.at(target(e, from)), from.edge_comm_weight(e));
+            to.add_edge(vertex_map[source(e, from)], vertex_map[target(e, from)], from.edge_comm_weight(e));
         }
     } else {
         for (const auto &v : from.vertices()) {
             for (const auto &child : from.children(v)) {
-                to.add_edge(vertex_map.at(v), vertex_map.at(child));
+                to.add_edge(vertex_map[v], vertex_map[child]);
             }
         }
     }
diff --git a/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp b/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp
index 0b67ab30..616aea6b 100644
--- a/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp
+++ b/include/osp/graph_implementations/adj_list_impl/cdag_vertex_impl.hpp
@@ -17,6 +17,8 @@ limitations under the License.
 */
 #pragma once
 
+#include <cstddef> // for std::size_t
+
 namespace osp {
 
 /**
@@ -71,17 +73,17 @@ struct cdag_vertex_impl {
 };
 
 /**
- * @brief A vertex implementation with integer weights. Indexed by size_t. Node types are unsigned.
+ * @brief A vertex implementation with integer weights. Indexed by std::size_t. Node types are unsigned.
  *
  * This struct implements a vertex with integer weights for work, communication, and memory.
  */
-using cdag_vertex_impl_int = cdag_vertex_impl<size_t, int, int, int, unsigned>;
+using cdag_vertex_impl_int = cdag_vertex_impl<std::size_t, int, int, int, unsigned>;
 
 /**
- * @brief A vertex implementation with unsigned weights. Indexed by size_t. Node types are unsigned.
+ * @brief A vertex implementation with unsigned weights. Indexed by std::size_t. Node types are unsigned.
  *
  * This struct implements a vertex with unsigned weights for work, communication, and memory.
  */
-using cdag_vertex_impl_unsigned = cdag_vertex_impl<size_t, unsigned, unsigned, unsigned, unsigned>;
+using cdag_vertex_impl_unsigned = cdag_vertex_impl<std::size_t, unsigned, unsigned, unsigned, unsigned>;
 
 } // namespace osp
\ No newline at end of file
diff --git a/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp b/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp
index 74340de6..efe1996e 100644
--- a/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp
+++ b/include/osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp
@@ -32,8 +32,8 @@ namespace osp {
  * @brief A vector-based implementation of a computational DAG.
  *
  * This class implements a computational DAG using adjacency lists stored in two std::vectors.
- * It manages the storage of vertices and edges, and provides an interface to query and modify the graph. 
- * 
+ * It manages the storage of vertices and edges, and provides an interface to query and modify the graph.
+ *
  * This class satisfies the following concepts:
  * - `is_computational_dag_typed_vertices`
  * - `is_directed_graph`
@@ -80,7 +80,7 @@ class computational_dag_vector_impl {
           num_vertex_types_(0) {
 
         for (vertex_idx i = 0; i < num_vertices; ++i) {
-            vertices_.at(i).id = i;
+            vertices_[i].id = i;
         }
     }
 
@@ -150,40 +150,40 @@ class computational_dag_vector_impl {
     [[nodiscard]] vertex_idx num_edges() const { return num_edges_; }
 
     /**
-     * @brief Returns the parents (in-neighbors) of a vertex.
+     * @brief Returns the parents (in-neighbors) of a vertex. Does not perform bounds checking.
      * @param v The vertex index.
      */
-    [[nodiscard]] const std::vector<vertex_idx> &parents(const vertex_idx v) const { return in_neigbors.at(v); }
+    [[nodiscard]] const std::vector<vertex_idx> &parents(const vertex_idx v) const { return in_neigbors[v]; }
 
     /**
-     * @brief Returns the children (out-neighbors) of a vertex.
+     * @brief Returns the children (out-neighbors) of a vertex. Does not perform bounds checking.
      * @param v The vertex index.
      */
-    [[nodiscard]] const std::vector<vertex_idx> &children(const vertex_idx v) const { return out_neigbors.at(v); }
+    [[nodiscard]] const std::vector<vertex_idx> &children(const vertex_idx v) const { return out_neigbors[v]; }
 
     /**
-     * @brief Returns the in-degree of a vertex.
+     * @brief Returns the in-degree of a vertex. Does not perform bounds checking.
      * @param v The vertex index.
      */
-    [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast<vertex_idx>(in_neigbors.at(v).size()); }
+    [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast<vertex_idx>(in_neigbors[v].size()); }
 
     /**
-     * @brief Returns the out-degree of a vertex.
+     * @brief Returns the out-degree of a vertex. Does not perform bounds checking.
      * @param v The vertex index.
      */
-    [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast<vertex_idx>(out_neigbors.at(v).size()); }
+    [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast<vertex_idx>(out_neigbors[v].size()); }
 
-    [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_.at(v).work_weight; }
+    [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_[v].work_weight; }
 
-    [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_.at(v).comm_weight; }
+    [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_[v].comm_weight; }
 
-    [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_.at(v).mem_weight; }
+    [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_[v].mem_weight; }
 
-    [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_.at(v).vertex_type; }
+    [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_[v].vertex_type; }
 
     [[nodiscard]] vertex_type_type num_vertex_types() const { return num_vertex_types_; }
 
-    [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_.at(v); }
+    [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_[v]; }
 
     /**
      * @brief Adds a new isolated vertex to the graph.
@@ -240,7 +240,7 @@ class computational_dag_vector_impl {
             return false;
         }
 
-        out_neigbors.at(source).push_back(target);
+        out_neigbors[source].push_back(target);
         in_neigbors.at(target).push_back(source);
         num_edges_++;
 
@@ -267,7 +267,6 @@ using computational_dag_vector_impl_def_t = computational_dag_vector_impl<cdag_v
  */
 using computational_dag_vector_impl_def_int_t = computational_dag_vector_impl<cdag_vertex_impl_int>;
 
-
 static_assert(is_directed_graph_edge_desc_v<computational_dag_vector_impl<cdag_vertex_impl_unsigned>>,
               "computational_dag_vector_impl must satisfy the directed_graph_edge_desc concept");
 
diff --git a/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp b/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp
index 1deadcee..3ab94872 100644
--- a/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp
+++ b/include/osp/graph_implementations/adj_list_impl/dag_vector_adapter.hpp
@@ -80,8 +80,8 @@ class dag_vector_adapter {
     dag_vector_adapter(const std::vector<std::vector<index_t>> &out_neigbors_,
                        const std::vector<std::vector<index_t>> &in_neigbors_) : vertices_(out_neigbors_.size()), out_neigbors(&out_neigbors_), in_neigbors(&in_neigbors_), num_edges_(0), num_vertex_types_(1) {
         for (vertex_idx i = 0; i < static_cast<vertex_idx>(out_neigbors_.size()); ++i) {
-            vertices_.at(i).id = i;
-            num_edges_ += out_neigbors_.at(i).size();
+            vertices_[i].id = i;
+            num_edges_ += out_neigbors_[i].size();
         }
     }
 
@@ -107,8 +107,8 @@ class dag_vector_adapter {
 
         num_edges_ = 0;
         for (vertex_idx i = 0; i < static_cast<vertex_idx>(out_neigbors->size()); ++i) {
-            vertices_.at(i).id = i;
-            num_edges_ += out_neigbors->at(i).size();
+            vertices_[i].id = i;
+            num_edges_ += (*out_neigbors)[i].size();
         }
 
         num_vertex_types_ = 1;
@@ -130,40 +130,40 @@ class dag_vector_adapter {
     [[nodiscard]] vertex_idx num_edges() const { return static_cast<vertex_idx>(num_edges_); }
 
     /**
-     * @brief Returns a view of the parents (in-neighbors) of a vertex.
+     * @brief Returns a view of the parents (in-neighbors) of a vertex. Does not perform bounds checking.
      * @param v The vertex index.
      */
-    [[nodiscard]] auto parents(const vertex_idx v) const { return vector_cast_view<index_t, vertex_idx>(in_neigbors->at(v)); }
+    [[nodiscard]] auto parents(const vertex_idx v) const { return vector_cast_view<index_t, vertex_idx>((*in_neigbors)[v]); }
 
     /**
-     * @brief Returns a view of the children (out-neighbors) of a vertex.
+     * @brief Returns a view of the children (out-neighbors) of a vertex. Does not perform bounds checking.
      * @param v The vertex index.
      */
-    [[nodiscard]] auto children(const vertex_idx v) const { return vector_cast_view<index_t, vertex_idx>(out_neigbors->at(v)); }
+    [[nodiscard]] auto children(const vertex_idx v) const { return vector_cast_view<index_t, vertex_idx>((*out_neigbors)[v]); }
 
     /**
-     * @brief Returns the in-degree of a vertex.
+     * @brief Returns the in-degree of a vertex. Does not perform bounds checking.
      * @param v The vertex index.
      */
-    [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast<vertex_idx>(in_neigbors->at(v).size()); }
+    [[nodiscard]] vertex_idx in_degree(const vertex_idx v) const { return static_cast<vertex_idx>((*in_neigbors)[v].size()); }
 
     /**
-     * @brief Returns the out-degree of a vertex.
+     * @brief Returns the out-degree of a vertex. Does not perform bounds checking.
      * @param v The vertex index.
      */
-    [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast<vertex_idx>(out_neigbors->at(v).size()); }
+    [[nodiscard]] vertex_idx out_degree(const vertex_idx v) const { return static_cast<vertex_idx>((*out_neigbors)[v].size()); }
 
-    [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_.at(v).work_weight; }
+    [[nodiscard]] vertex_work_weight_type vertex_work_weight(const vertex_idx v) const { return vertices_[v].work_weight; }
 
-    [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_.at(v).comm_weight; }
+    [[nodiscard]] vertex_comm_weight_type vertex_comm_weight(const vertex_idx v) const { return vertices_[v].comm_weight; }
 
-    [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_.at(v).mem_weight; }
+    [[nodiscard]] vertex_mem_weight_type vertex_mem_weight(const vertex_idx v) const { return vertices_[v].mem_weight; }
 
-    [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_.at(v).vertex_type; }
+    [[nodiscard]] vertex_type_type vertex_type(const vertex_idx v) const { return vertices_[v].vertex_type; }
 
     [[nodiscard]] vertex_type_type num_vertex_types() const { return num_vertex_types_; }
 
-    [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_.at(v); }
+    [[nodiscard]] const v_impl &get_vertex_impl(const vertex_idx v) const { return vertices_[v]; }
 
     void set_vertex_work_weight(const vertex_idx v, const vertex_work_weight_type work_weight) {
         vertices_.at(v).work_weight = work_weight;
@@ -192,7 +192,6 @@ class dag_vector_adapter {
     unsigned num_vertex_types_ = 0;
 };
 
-
 static_assert(is_directed_graph_edge_desc_v<dag_vector_adapter<cdag_vertex_impl_unsigned, int>>,
               "dag_vector_adapter must satisfy the directed_graph_edge_desc concept");
 
diff --git a/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp b/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp
index e8fbe586..b42ea17d 100644
--- a/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp
+++ b/include/osp/graph_implementations/adj_list_impl/vector_cast_view.hpp
@@ -141,7 +141,7 @@ class vector_cast_view {
      * @param i The index of the element to access.
      * @return The element at index i, cast to to_t.
      */
-    [[nodiscard]] auto operator[](std::size_t i) const { return static_cast<to_t>(vec.at(i)); }
+    [[nodiscard]] auto operator[](std::size_t i) const { return static_cast<to_t>(vec[i]); }
 };
 
 } // namespace osp
\ No newline at end of file
diff --git a/tests/bsp_architecture.cpp b/tests/bsp_architecture.cpp
index af26e034..d803bb56 100644
--- a/tests/bsp_architecture.cpp
+++ b/tests/bsp_architecture.cpp
@@ -19,8 +19,8 @@ limitations under the License.
 #define BOOST_TEST_MODULE Bsp_Architecture
 #include <boost/test/unit_test.hpp>
 
-#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp"
 #include "osp/bsp/model/BspArchitecture.hpp"
+#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp"
 
 using namespace osp;
 
@@ -61,18 +61,18 @@ BOOST_AUTO_TEST_CASE(ParameterizedConstructorTest) {
 
     BOOST_CHECK_EQUAL(architecture.maxMemoryBoundProcType(0), 100);
 
-    BOOST_TEST(architecture.sendCostMatrix() == uniform_sent_costs);
+    BOOST_TEST(architecture.sendCost() == uniform_sent_costs);
 
     std::vector<std::vector<int>> expectedSendCosts = {{0, 2, 2, 2}, {2, 0, 2, 2}, {2, 2, 0, 2}, {2, 2, 2, 0}};
 
-    architecture.setSendCosts(expectedSendCosts);
-    BOOST_TEST(architecture.sendCostMatrix() == expectedSendCosts);
+    architecture.SetSendCosts(expectedSendCosts);
+    BOOST_TEST(architecture.sendCost() == expectedSendCosts);
 
     BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 1), 4);
     BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 0), 0);
 
     architecture.SetUniformSendCost();
-    BOOST_TEST(architecture.sendCostMatrix() == uniform_sent_costs);
+    BOOST_TEST(architecture.sendCost() == uniform_sent_costs);
 
     BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 1), 2);
     BOOST_CHECK_EQUAL(architecture.communicationCosts(0, 0), 0);
@@ -141,8 +141,7 @@ BOOST_AUTO_TEST_CASE(Architecture) {
     }
 
     // constructor
-    std::vector<std::vector<int>> send_costs = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1},
-                                                {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}};
+    std::vector<std::vector<int>> send_costs = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}};
 
     BOOST_CHECK_THROW(BspArchitecture<computational_dag_vector_impl_def_int_t> test31(7, 42942, 0, send_costs),
                       std::invalid_argument);
@@ -169,10 +168,8 @@ BOOST_AUTO_TEST_CASE(Architecture) {
     }
 
     // constructor
-    std::vector<std::vector<int>> send_costs2 = {{0, 1, 2, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1},
-                                                 {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}};
-    std::vector<std::vector<int>> send_costs3 = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1},
-                                                 {3, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}};
+    std::vector<std::vector<int>> send_costs2 = {{0, 1, 2, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {1, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}};
+    std::vector<std::vector<int>> send_costs3 = {{0, 1, 1, 1, 1, 1}, {1, 0, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1}, {3, 1, 1, 0, 1, 1}, {1, 1, 1, 1, 0, 1}, {1, 1, 1, 1, 1, 0}};
 
     BspArchitecture<computational_dag_vector_impl_def_int_t> test4(6, 0, 4294965, send_costs2);
     BOOST_CHECK_EQUAL(test4.numberOfProcessors(), 6);
diff --git a/tests/bsp_instance.cpp b/tests/bsp_instance.cpp
index c2b0b02a..60e95999 100644
--- a/tests/bsp_instance.cpp
+++ b/tests/bsp_instance.cpp
@@ -19,12 +19,12 @@ limitations under the License.
 #define BOOST_TEST_MODULE Bsp_Architecture
 #include <boost/test/unit_test.hpp>
 
+#include "osp/auxiliary/io/arch_file_reader.hpp"
+#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
 #include "osp/bsp/model/BspInstance.hpp"
 #include "osp/bsp/model/BspSchedule.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp"
-#include "osp/auxiliary/io/arch_file_reader.hpp"
-#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
 #include <filesystem>
 #include <iostream>
 
@@ -84,8 +84,7 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) {
     BOOST_CHECK_EQUAL(instance.isCompatible(0, 0), true);
     BOOST_CHECK_EQUAL(instance.isCompatible(1, 0), false);
 
-
-    compatible_processor_range range(instance);
+    CompatibleProcessorRange range(instance);
 
     BOOST_CHECK_EQUAL(range.compatible_processors_type(0).size(), 3);
     BOOST_CHECK_EQUAL(range.compatible_processors_type(1).size(), 1);
@@ -97,7 +96,6 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) {
     }
     std::cout << std::endl;
 
-
     std::cout << "Compatible processors type 1: " << std::endl;
 
     for (const auto &p : range.compatible_processors_type(1)) {
@@ -105,7 +103,6 @@ BOOST_AUTO_TEST_CASE(test_instance_bicgstab) {
     }
     std::cout << std::endl;
 
-
     BOOST_CHECK_EQUAL(range.compatible_processors_vertex(0).size(), 1);
     BOOST_CHECK_EQUAL(range.compatible_processors_vertex(1).size(), 3);
     BOOST_CHECK_EQUAL(range.compatible_processors_vertex(2).size(), 3);
diff --git a/tests/debug_merkle_divider.cpp b/tests/debug_merkle_divider.cpp
index bf3bd1b5..5763d840 100644
--- a/tests/debug_merkle_divider.cpp
+++ b/tests/debug_merkle_divider.cpp
@@ -16,24 +16,23 @@ limitations under the License.
 @author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
 */
 
-#include <iostream>
-#include "osp/auxiliary/io/dot_graph_file_reader.hpp"
 #include "osp/auxiliary/io/DotFileWriter.hpp"
+#include "osp/auxiliary/io/dot_graph_file_reader.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/BspLocking.hpp"
-#include "osp/bsp/scheduler/Serial.hpp"
-#include "osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/GreedyChildren.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GreedyMetaScheduler.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCores.hpp"
 #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp"
+#include "osp/bsp/scheduler/Serial.hpp"
 #include "osp/coarser/coarser_util.hpp"
 #include "osp/dag_divider/isomorphism_divider/IsomorphicSubgraphScheduler.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp"
+#include <iostream>
 
 using namespace osp;
 
-
 template<typename GraphT>
-void check_partition_type_homogeneity(const GraphT& dag, const std::vector<vertex_idx_t<GraphT>>& partition) {
+void check_partition_type_homogeneity(const GraphT &dag, const std::vector<vertex_idx_t<GraphT>> &partition) {
     // Group partitions by their ID
     std::map<vertex_idx_t<GraphT>, std::vector<vertex_idx_t<GraphT>>> partitions;
     for (vertex_idx_t<GraphT> i = 0; i < dag.num_vertices(); ++i) {
@@ -41,19 +40,20 @@ void check_partition_type_homogeneity(const GraphT& dag, const std::vector<verte
     }
 
     // For each partition, check that all vertices have the same type
-    for (const auto& [part_id, vertices] : partitions) {
-        if (vertices.empty()) continue;
+    for (const auto &[part_id, vertices] : partitions) {
+        if (vertices.empty())
+            continue;
         const auto first_node_type = dag.vertex_type(vertices[0]);
-        for (const auto& vertex : vertices) {
+        for (const auto &vertex : vertices) {
             if (dag.vertex_type(vertex) != first_node_type) {
                 std::cerr << "Partition " << part_id << " contains vertices with different types." << std::endl;
                 return;
-            } 
+            }
         }
     }
 }
 
-int main(int argc, char* argv[]) {
+int main(int argc, char *argv[]) {
     if (argc < 2) {
         std::cerr << "Usage: " << argv[0] << " <path_to_dot_file>" << std::endl;
         return 1;
@@ -76,15 +76,12 @@ int main(int argc, char* argv[]) {
         instance.getComputationalDag().set_vertex_comm_weight(v, static_cast<v_commw_t<graph_t2>>(instance.getComputationalDag().vertex_comm_weight(v) * 0.01));
     }
 
-
     // Set up architecture
-    instance.getArchitecture().set_processors_consequ_types({24,48},{100,100});
+    instance.getArchitecture().SetProcessorsConsequTypes({24, 48}, {100, 100});
     instance.setDiagonalCompatibilityMatrix(2);
     instance.setSynchronisationCosts(2000);
     instance.setCommunicationCosts(1);
 
-    
-
     // Set up the scheduler
     GrowLocalAutoCores<graph_t> growlocal;
     BspLocking<graph_t> locking;
@@ -95,9 +92,9 @@ int main(int argc, char* argv[]) {
     ComboScheduler<graph_t> growlocal_kl(growlocal, kl);
     ComboScheduler<graph_t> locking_kl(locking, kl);
     ComboScheduler<graph_t> children_kl(children, kl);
- 
+
     GreedyMetaScheduler<graph_t> scheduler;
-    //scheduler.addScheduler(growlocal_kl);
+    // scheduler.addScheduler(growlocal_kl);
     scheduler.addScheduler(locking_kl);
     scheduler.addScheduler(children_kl);
     scheduler.addSerialScheduler();
@@ -120,7 +117,7 @@ int main(int argc, char* argv[]) {
     graph_t corase_graph;
     coarser_util::construct_coarse_dag(instance.getComputationalDag(), corase_graph, partition);
     bool acyc = is_acyclic(corase_graph);
-    std::cout << "Partition is " << (acyc ? "acyclic." : "not acyclic."); 
+    std::cout << "Partition is " << (acyc ? "acyclic." : "not acyclic.") << std::endl;
 
     std::cout << "Partition computation finished." << std::endl;
     std::cout << "Generated " << std::set<vertex_idx_t<graph_t>>(partition.begin(), partition.end()).size() << " partitions." << std::endl;
diff --git a/tests/kl_bsp_improver_test.cpp b/tests/kl_bsp_improver_test.cpp
index df3ac3f1..6e1611ec 100644
--- a/tests/kl_bsp_improver_test.cpp
+++ b/tests/kl_bsp_improver_test.cpp
@@ -152,7 +152,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) {
 
 //         std::vector<std::vector<int>> send_cost = {{0, 1, 4, 4}, {1, 0, 4, 4}, {4, 4, 0, 1}, {4, 4, 1, 0}};
 
-//         instance.getArchitecture().setSendCosts(send_cost);
+//         instance.getArchitecture().SetSendCosts(send_cost);
 
 //         if (!status_graph) {
 
diff --git a/tests/kl_lambda.cpp b/tests/kl_lambda.cpp
index a7f40cf4..31f86130 100644
--- a/tests/kl_lambda.cpp
+++ b/tests/kl_lambda.cpp
@@ -25,14 +25,14 @@ limitations under the License.
 #include "osp/bsp/scheduler/LocalSearch/KernighanLin/kl_total_comm.hpp"
 #include "osp/bsp/scheduler/LocalSearch/KernighanLin/kl_total_cut.hpp"
 
+#include "osp/auxiliary/io/arch_file_reader.hpp"
+#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp"
 #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp"
 #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp"
-#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp"
-#include "osp/auxiliary/io/arch_file_reader.hpp"
-#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
-#include "test_graphs.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
+#include "test_graphs.hpp"
 
 using namespace osp;
 
@@ -56,37 +56,38 @@ void add_node_types(Graph_t &dag) {
 
     for (const auto &v : dag.vertices()) {
         dag.set_vertex_type(v, node_type++ % 2);
-    }    
+    }
 }
 
 template<typename table_t>
-void check_equal_affinity_table(table_t & table_1, table_t & table_2, const std::set<size_t> & nodes) {
+void check_equal_affinity_table(table_t &table_1, table_t &table_2, const std::set<size_t> &nodes) {
 
-    for ( auto i : nodes) {
+    for (auto i : nodes) {
         BOOST_CHECK_EQUAL(table_1[i].size(), table_2[i].size());
-        if (table_1[i].size() != table_2[i].size()) continue;
+        if (table_1[i].size() != table_2[i].size())
+            continue;
         for (size_t j = 0; j < table_1[i].size(); ++j) {
             BOOST_CHECK_EQUAL(table_1[i][j].size(), table_2[i][j].size());
-            if (table_1[i][j].size() != table_2[i][j].size()) continue;
+            if (table_1[i][j].size() != table_2[i][j].size())
+                continue;
             for (size_t k = 0; k < table_1[i][j].size(); ++k) {
                 BOOST_CHECK(std::abs(table_1[i][j][k] - table_2[i][j][k]) < 0.000001);
 
-                if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) {                   
-                    std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl;                   
-
+                if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) {
+                    std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl;
                 }
             }
         }
     }
 }
 
-void check_equal_lambda_map(const std::vector<std::map<unsigned,unsigned>> & map_1, const std::vector<std::map<unsigned,unsigned>> & map_2) {
+void check_equal_lambda_map(const std::vector<std::map<unsigned, unsigned>> &map_1, const std::vector<std::map<unsigned, unsigned>> &map_2) {
     BOOST_CHECK_EQUAL(map_1.size(), map_2.size());
     if (map_1.size() != map_2.size())
         return;
 
     for (size_t i = 0; i < map_1.size(); ++i) {
-        for (const auto & [key, value] : map_1[i]) {
+        for (const auto &[key, value] : map_1[i]) {
             BOOST_CHECK_EQUAL(value, map_2[i].at(key));
 
             if (value != map_2[i].at(key)) {
@@ -117,7 +118,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) {
         BspInstance<graph> instance;
 
         bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(),
-                                                                            instance.getComputationalDag());
+                                                                              instance.getComputationalDag());
 
         instance.getArchitecture().setSynchronisationCosts(5);
         instance.getArchitecture().setCommunicationCosts(5);
@@ -134,7 +135,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) {
         add_mem_weights(instance.getComputationalDag());
         add_node_types(instance.getComputationalDag());
 
-        instance.getArchitecture().setProcessorsWithTypes({0,0,1,1});
+        instance.getArchitecture().setProcessorsWithTypes({0, 0, 1, 1});
 
         instance.setDiagonalCompatibilityMatrix(2);
 
@@ -147,18 +148,15 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_with_node_types_test) {
         BOOST_CHECK(schedule.satisfiesNodeTypeConstraints());
 
         kl_total_lambda_comm_improver<graph, no_local_search_memory_constraint, 1> kl;
-        
+
         auto status = kl.improveSchedule(schedule);
 
         BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
         BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
         BOOST_CHECK(schedule.satisfiesNodeTypeConstraints());
-        
     }
 }
 
-
-
 BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) {
 
     std::vector<std::string> filenames_graph = test_graphs();
@@ -180,7 +178,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) {
         BspInstance<graph> instance;
 
         bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(),
-                                                                            instance.getComputationalDag());
+                                                                              instance.getComputationalDag());
 
         instance.getArchitecture().setSynchronisationCosts(5);
         instance.getArchitecture().setCommunicationCosts(5);
@@ -204,7 +202,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) {
         BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
 
         kl_total_lambda_comm_improver<graph, no_local_search_memory_constraint, 1> kl;
-        
+
         auto status = kl.improveSchedule(schedule);
 
         BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
@@ -252,11 +250,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) {
 //     schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3});
 
 //     schedule.updateNumberOfSupersteps();
-    
-//     using cost_f = kl_hyper_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1>;  
+
+//     using cost_f = kl_hyper_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1>;
 //     using kl_improver_test = kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double>;
 //     kl_improver_test kl;
-    
+
 //     kl.setup_schedule(schedule);
 
 //     auto &kl_active_schedule = kl.get_active_schedule();
@@ -269,7 +267,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) {
 //     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0);
 //     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0);
 //     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0);
-        
+
 //     BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4);
 //     BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true);
 
@@ -369,7 +367,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) {
 //     using graph = computational_dag_edge_idx_vector_impl_def_int_t;
 //     using VertexType = graph::vertex_idx;
 //     using kl_move = kl_move_struct<double, VertexType>;
-    
 
 //     graph dag;
 
@@ -401,11 +398,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) {
 //     schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3});
 
 //     schedule.updateNumberOfSupersteps();
-    
-//     using cost_f = kl_hyper_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1>; 
+
+//     using cost_f = kl_hyper_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1>;
 //     using kl_improver_test = kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double>;
 //     kl_improver_test kl;
-    
+
 //     kl.setup_schedule(schedule);
 
 //     auto &kl_active_schedule = kl.get_active_schedule();
@@ -418,7 +415,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_on_test_graphs) {
 //     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0);
 //     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0);
 //     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0);
-        
+
 //      auto node_selection = kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7});
 
 //     std::set<VertexType> nodes_to_check = {0, 1, 2, 3, 4, 5, 6, 7};
@@ -533,11 +530,10 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_penalty_test) {
 
     schedule.updateNumberOfSupersteps();
 
-    
-    using cost_f = kl_hyper_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1>; 
+    using cost_f = kl_hyper_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1>;
     using kl_improver_test = kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double>;
     kl_improver_test kl;
-    
+
     kl.setup_schedule(schedule);
 
     auto &kl_active_schedule = kl.get_active_schedule();
@@ -550,48 +546,47 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_penalty_test) {
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0);
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0);
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0);
-        
+
     BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4);
     BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true);
 
-    auto node_selection = kl.insert_gain_heap_test_penalty({2,3});
+    auto node_selection = kl.insert_gain_heap_test_penalty({2, 3});
 
     auto recompute_max_gain = kl.run_inner_iteration_test(); // best move 3
-    std::cout << "------------------------recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "------------------------recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
-    std::cout << "}" << std::endl; 
+    }
+    std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test(); // best move 0
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test(); // best move 1
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
-   
+
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
-
 }
 
 BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
@@ -629,27 +624,27 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
     schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3});
 
     schedule.updateNumberOfSupersteps();
-    
-    using cost_f = kl_hyper_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1>; 
+
+    using cost_f = kl_hyper_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1>;
     using kl_improver_test = kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double>;
     kl_improver_test kl;
-    
+
     kl.setup_schedule(schedule);
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
-    auto node_selection = kl.insert_gain_heap_test_penalty({7}); 
+    auto node_selection = kl.insert_gain_heap_test_penalty({7});
 
     auto recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "-----------recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "-----------recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
-    std::cout << "}" << std::endl; 
+    }
+    std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
-        
-    auto& lambda_map = kl.get_comm_cost_f().node_lambda_map;
+
+    auto &lambda_map = kl.get_comm_cost_f().node_lambda_map;
 
     BOOST_CHECK(lambda_map.get_proc_entry(v1, 0) == 2);
     BOOST_CHECK(lambda_map.get_proc_entry(v1, 1) == 1);
@@ -669,32 +664,31 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
     BOOST_CHECK(lambda_map.has_no_proc_entry(v8, 0));
 
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
-
 }
 
 // BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs) {
@@ -708,7 +702,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
 //         std::cout << cwd << std::endl;
 //     }
 
-   
 //     for (auto &filename_graph : filenames_graph) {
 //         GreedyBspScheduler<computational_dag_edge_idx_vector_impl_def_int_t> test_scheduler;
 //         BspInstance<graph> instance;
@@ -724,7 +717,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
 //                                                    {4,4,0,1},
 //                                                    {4,4,1,0}};
 
-//         instance.getArchitecture().setSendCosts(send_cost);
+//         instance.getArchitecture().SetSendCosts(send_cost);
 
 //         if (!status_graph) {
 
@@ -752,7 +745,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
 //         auto status = kl.improveSchedule(schedule);
 //         auto finish_time = std::chrono::high_resolution_clock::now();
 //         auto duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
-        
+
 //         std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalLambdaCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl;
 
 //         BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
@@ -763,18 +756,17 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
 //         // start_time = std::chrono::high_resolution_clock::now();
 //         // status = kl_old.improve_schedule_test_2(schedule_2);
 //         // finish_time = std::chrono::high_resolution_clock::now();
-        
+
 //         // duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
 
 //         // std::cout << "kl old finished in " << duration << " seconds, costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl;
-        
+
 //         // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
 //         // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true);
 
 //     }
 // }
 
-
 // BOOST_AUTO_TEST_CASE(kl_lambda_total_comm_large_test_graphs_mt) {
 //     std::vector<std::string> filenames_graph = large_spaa_graphs();
 //     using graph = computational_dag_edge_idx_vector_impl_def_int_t;
@@ -786,7 +778,6 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
 //         std::cout << cwd << std::endl;
 //     }
 
-   
 //     for (auto &filename_graph : filenames_graph) {
 //         GreedyBspScheduler<computational_dag_edge_idx_vector_impl_def_int_t> test_scheduler;
 //         BspInstance<graph> instance;
@@ -802,7 +793,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
 //                                                    {4,4,0,1},
 //                                                    {4,4,1,0}};
 
-//         instance.getArchitecture().setSendCosts(send_cost);
+//         instance.getArchitecture().SetSendCosts(send_cost);
 
 //         if (!status_graph) {
 
@@ -830,7 +821,7 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
 //         auto status = kl.improveSchedule(schedule);
 //         auto finish_time = std::chrono::high_resolution_clock::now();
 //         auto duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
-        
+
 //         std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalLambdaCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl;
 
 //         BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
@@ -841,11 +832,11 @@ BOOST_AUTO_TEST_CASE(kl_lambda_improver_inner_loop_lambda_map_test) {
 //         // start_time = std::chrono::high_resolution_clock::now();
 //         // status = kl_old.improve_schedule_test_2(schedule_2);
 //         // finish_time = std::chrono::high_resolution_clock::now();
-        
+
 //         // duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
 
 //         // std::cout << "kl old finished in " << duration << " seconds, costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl;
-        
+
 //         // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
 //         // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true);
 
diff --git a/tests/kl_total.cpp b/tests/kl_total.cpp
index 5d3d1486..58421144 100644
--- a/tests/kl_total.cpp
+++ b/tests/kl_total.cpp
@@ -22,18 +22,17 @@ limitations under the License.
 
 #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp"
 
+#include "osp/auxiliary/io/arch_file_reader.hpp"
+#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp"
+#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp"
 #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include.hpp"
 #include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_include_mt.hpp"
-#include "osp/bsp/scheduler/LocalSearch/KernighanLin_v2/kl_improver_test.hpp"
-#include "osp/auxiliary/io/arch_file_reader.hpp"
-#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
-#include "test_graphs.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
+#include "test_graphs.hpp"
 
 using namespace osp;
 
-
 template<typename Graph_t>
 void add_mem_weights(Graph_t &dag) {
 
@@ -49,17 +48,16 @@ void add_mem_weights(Graph_t &dag) {
 }
 
 template<typename table_t>
-void check_equal_affinity_table(table_t & table_1, table_t & table_2, const std::set<size_t> & nodes) {
+void check_equal_affinity_table(table_t &table_1, table_t &table_2, const std::set<size_t> &nodes) {
     BOOST_CHECK_EQUAL(table_1.size(), table_2.size());
 
-    for ( auto i : nodes) {
+    for (auto i : nodes) {
         for (size_t j = 0; j < table_1[i].size(); ++j) {
             for (size_t k = 0; k < table_1[i][j].size(); ++k) {
                 BOOST_CHECK(std::abs(table_1[i][j][k] - table_2[i][j][k]) < 0.000001);
 
-                if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) {                   
-                    std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl;                   
-
+                if (std::abs(table_1[i][j][k] - table_2[i][j][k]) > 0.000001) {
+                    std::cout << "Mismatch at [" << i << "][" << j << "][" << k << "]: table_1=" << table_1[i][j][k] << ", table_2=" << table_2[i][j][k] << std::endl;
                 }
             }
         }
@@ -102,16 +100,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_smoke_test) {
 
     schedule.updateNumberOfSupersteps();
 
-    
     using kl_improver_t = kl_total_comm_improver<graph, no_local_search_memory_constraint, 1, true>;
     kl_improver_t kl;
-    
-          
+
     auto status = kl.improveSchedule(schedule);
 
     BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
     BOOST_CHECK_EQUAL(schedule.satisfiesPrecedenceConstraints(), true);
-
 }
 
 BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) {
@@ -135,7 +130,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) {
         BspInstance<graph> instance;
 
         bool status_graph = file_reader::readComputationalDagHyperdagFormatDB((cwd / filename_graph).string(),
-                                                                            instance.getComputationalDag());
+                                                                              instance.getComputationalDag());
 
         instance.getArchitecture().setSynchronisationCosts(5);
         instance.getArchitecture().setCommunicationCosts(5);
@@ -147,7 +142,6 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) {
             BOOST_CHECK(false);
         }
 
-
         add_mem_weights(instance.getComputationalDag());
 
         BspSchedule<graph> schedule(instance);
@@ -158,7 +152,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_on_test_graphs) {
         BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
 
         kl_total_comm_improver<graph> kl;
-        
+
         auto status = kl.improveSchedule(schedule);
 
         BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
@@ -170,7 +164,7 @@ BOOST_AUTO_TEST_CASE(kl_improver_superstep_removal_test) {
 
     using graph = computational_dag_edge_idx_vector_impl_def_int_t;
     using VertexType = graph::vertex_idx;
-    
+
     graph dag;
 
     const VertexType v1 = dag.add_vertex(2, 9, 2);
@@ -200,13 +194,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_superstep_removal_test) {
     // Create a schedule with an almost empty superstep (step 1)
     schedule.setAssignedProcessors({0, 0, 0, 0, 1, 1, 1, 1});
     schedule.setAssignedSupersteps({0, 0, 0, 0, 1, 2, 2, 2});
-    
+
     schedule.updateNumberOfSupersteps();
     unsigned original_steps = schedule.numberOfSupersteps();
-    
-    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>; 
+
+    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>;
     kl_improver<graph, cost_f, no_local_search_memory_constraint, 1, double> kl;
-    
+
     auto status = kl.improveSchedule(schedule);
 
     BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
@@ -250,11 +244,10 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) {
 
     schedule.updateNumberOfSupersteps();
 
-    
-    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>; 
+    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>;
     using kl_improver_test = kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double>;
     kl_improver_test kl;
-    
+
     kl.setup_schedule(schedule);
 
     auto &kl_active_schedule = kl.get_active_schedule();
@@ -267,13 +260,13 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) {
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0);
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0);
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0);
-        
+
     BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4);
     BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true);
 
-    auto node_selection = kl.insert_gain_heap_test_penalty({2,3});
+    auto node_selection = kl.insert_gain_heap_test_penalty({2, 3});
 
-    auto& affinity = kl.get_affinity_table();
+    auto &affinity = kl.get_affinity_table();
 
     BOOST_CHECK_CLOSE(affinity[v3][0][0], 5.5, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v3][0][1], 4.0, 0.00001);
@@ -290,41 +283,40 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_test) {
     BOOST_CHECK_CLOSE(affinity[v4][1][2], -3.5, 0.00001);
 
     auto recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "------------------------recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "------------------------recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
-    std::cout << "}" << std::endl; 
+    }
+    std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
-        
+
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
-
 }
 
 BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_penalty_test) {
@@ -363,55 +355,53 @@ BOOST_AUTO_TEST_CASE(kl_improver_inner_loop_penalty_test) {
 
     schedule.updateNumberOfSupersteps();
 
-    
-    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>; 
+    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>;
     using kl_improver_test = kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double>;
     kl_improver_test kl;
-    
+
     kl.setup_schedule(schedule);
 
-    //auto &kl_active_schedule = kl.get_active_schedule();
+    // auto &kl_active_schedule = kl.get_active_schedule();
 
     BOOST_CHECK_CLOSE(51.5, kl.get_current_cost(), 0.00001);
 
-    auto node_selection = kl.insert_gain_heap_test_penalty({7}); 
+    auto node_selection = kl.insert_gain_heap_test_penalty({7});
 
     auto recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "-----------recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "-----------recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
-    std::cout << "}" << std::endl; 
+    }
+    std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
-        
+
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
 
     recompute_max_gain = kl.run_inner_iteration_test();
-    std::cout << "recompute max_gain: { "; 
-    for (const auto & [key, value] : recompute_max_gain) {
+    std::cout << "recompute max_gain: { ";
+    for (const auto &[key, value] : recompute_max_gain) {
         std::cout << key << " ";
-    }                
+    }
     std::cout << "}" << std::endl;
 
     BOOST_CHECK_CLOSE(kl.get_comm_cost_f().compute_schedule_cost_test(), kl.get_current_cost(), 0.00001);
-
 }
 
 BOOST_AUTO_TEST_CASE(kl_improver_violation_handling_test) {
@@ -450,16 +440,15 @@ BOOST_AUTO_TEST_CASE(kl_improver_violation_handling_test) {
 
     schedule.updateNumberOfSupersteps();
 
-    
-    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>; 
+    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>;
     kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double> kl;
-    
+
     kl.setup_schedule(schedule);
 
     kl.compute_violations_test();
 
     BOOST_CHECK_EQUAL(kl.is_feasible(), false);
-  
+
     kl_improver<graph, cost_f, no_local_search_memory_constraint, 1, double> kl_improver;
     kl_improver.improveSchedule(schedule);
 
@@ -502,10 +491,9 @@ BOOST_AUTO_TEST_CASE(kl_base_1) {
 
     schedule.updateNumberOfSupersteps();
 
-    
-    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>; 
+    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>;
     kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double> kl;
-    
+
     kl.setup_schedule(schedule);
 
     auto &kl_active_schedule = kl.get_active_schedule();
@@ -529,11 +517,11 @@ BOOST_AUTO_TEST_CASE(kl_base_1) {
     BOOST_CHECK_EQUAL(kl.is_feasible(), false);
     BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001);
 
-    kl_move move_2(v2, 3.0 + 4.5 - 4.0 , 0, 0, 1, 0);
+    kl_move move_2(v2, 3.0 + 4.5 - 4.0, 0, 0, 1, 0);
 
     kl.apply_move_test(move_2);
 
-    BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(0), 39.0); // 42-3
+    BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(0), 39.0);       // 42-3
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(0), 5.0); // 2+3
     BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 1);
     BOOST_CHECK_EQUAL(kl.is_feasible(), false);
@@ -541,7 +529,7 @@ BOOST_AUTO_TEST_CASE(kl_base_1) {
 
     kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7});
 
-    auto& affinity = kl.get_affinity_table();
+    auto &affinity = kl.get_affinity_table();
 
     BOOST_CHECK_CLOSE(affinity[v1][0][1], 2.0 - 4.5, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v1][1][1], 0.0, 0.00001);
@@ -598,10 +586,9 @@ BOOST_AUTO_TEST_CASE(kl_base_2) {
 
     schedule.updateNumberOfSupersteps();
 
-    
-    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>; 
+    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>;
     kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double> kl;
-    
+
     kl.setup_schedule(schedule);
 
     auto &kl_active_schedule = kl.get_active_schedule();
@@ -614,7 +601,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) {
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0);
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0);
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0);
-    
+
     BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4);
     BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001);
     BOOST_CHECK_EQUAL(kl.is_feasible(), true);
@@ -636,7 +623,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) {
     BOOST_CHECK_EQUAL(kl.is_feasible(), true);
     BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001);
 
-    kl_move move_2(v2, -1.0 - 8.5 , 1, 1, 0, 0);
+    kl_move move_2(v2, -1.0 - 8.5, 1, 1, 0, 0);
 
     kl.apply_move_test(move_2);
 
@@ -652,7 +639,7 @@ BOOST_AUTO_TEST_CASE(kl_base_2) {
     BOOST_CHECK_EQUAL(kl.is_feasible(), false);
     BOOST_CHECK_CLOSE(kl.get_current_cost(), kl.get_comm_cost_f().compute_schedule_cost(), 0.00001);
 
-    kl_move move_x(v2, -2.0 + 8.5 , 0, 0, 1, 0);
+    kl_move move_x(v2, -2.0 + 8.5, 0, 0, 1, 0);
 
     kl.apply_move_test(move_x);
 
@@ -670,14 +657,13 @@ BOOST_AUTO_TEST_CASE(kl_base_2) {
 
     kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7});
 
-    auto& affinity = kl.get_affinity_table();
+    auto &affinity = kl.get_affinity_table();
 
     BOOST_CHECK_CLOSE(affinity[v1][0][1], -4.5, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v1][0][2], -2.5, 0.00001);
 
     BOOST_CHECK_CLOSE(affinity[v1][1][1], 2.0, 0.00001);
-    BOOST_CHECK_CLOSE(affinity[v1][1][2], 0.0, 0.00001); 
-
+    BOOST_CHECK_CLOSE(affinity[v1][1][2], 0.0, 0.00001);
 
     BOOST_CHECK_CLOSE(affinity[v2][0][1], 9.5, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v2][0][2], 11.5, 0.00001);
@@ -719,7 +705,6 @@ BOOST_AUTO_TEST_CASE(kl_base_2) {
     BOOST_CHECK_CLOSE(affinity[v7][1][0], 7.0, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v7][1][1], 8.0, 0.00001);
 
-
     BOOST_CHECK_CLOSE(affinity[v8][0][0], 8.5, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v8][0][1], 8.5, 0.00001);
 
@@ -763,10 +748,9 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 
     schedule.updateNumberOfSupersteps();
 
-    
-    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>; 
+    using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>;
     kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double> kl;
-    
+
     kl.setup_schedule(schedule);
 
     auto &kl_active_schedule = kl.get_active_schedule();
@@ -779,21 +763,19 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(2), 6.0);
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_max_work(3), 9.0);
     BOOST_CHECK_EQUAL(kl_active_schedule.work_datastructures.step_second_max_work(3), 8.0);
-        
+
     BOOST_CHECK_EQUAL(kl_active_schedule.num_steps(), 4);
     BOOST_CHECK_EQUAL(kl_active_schedule.is_feasible(), true);
 
     kl.insert_gain_heap_test_penalty({0, 1, 2, 3, 4, 5, 6, 7});
 
-    auto& affinity = kl.get_affinity_table();
-
+    auto &affinity = kl.get_affinity_table();
 
     BOOST_CHECK_CLOSE(affinity[v1][0][1], 1.0, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v1][0][2], 3.0, 0.00001);
 
     BOOST_CHECK_CLOSE(affinity[v1][1][1], 2.0, 0.00001);
-    BOOST_CHECK_CLOSE(affinity[v1][1][2], 16.5, 0.00001); 
-
+    BOOST_CHECK_CLOSE(affinity[v1][1][2], 16.5, 0.00001);
 
     BOOST_CHECK_CLOSE(affinity[v2][0][1], 15, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v2][0][2], 11.5, 0.00001);
@@ -835,16 +817,13 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
     BOOST_CHECK_CLOSE(affinity[v7][1][0], 7.0, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v7][1][1], 8.0, 0.00001);
 
-    
     BOOST_CHECK_CLOSE(affinity[v8][0][0], 14.0, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v8][0][1], 8.5, 0.00001);
 
     BOOST_CHECK_CLOSE(affinity[v8][1][0], 8.0, 0.00001);
     BOOST_CHECK_CLOSE(affinity[v8][1][1], 1.0, 0.00001);
-
 }
 
-
 // BOOST_AUTO_TEST_CASE(kl_improver_incremental_update_test) {
 
 //     using graph = computational_dag_edge_idx_vector_impl_def_int_t;
@@ -881,12 +860,11 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //     schedule.setAssignedSupersteps({0, 0, 1, 1, 2, 2, 3, 3});
 
 //     schedule.updateNumberOfSupersteps();
-    
-    
-//     using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>; 
+
+//     using cost_f = kl_total_comm_cost_function<graph, double, no_local_search_memory_constraint, 1, true>;
 //     using kl_improver_test = kl_improver_test<graph, cost_f, no_local_search_memory_constraint, 1, double>;
 //     kl_improver_test kl;
-    
+
 //     kl.setup_schedule(schedule);
 
 //     auto node_selection = kl.insert_gain_heap_test({0, 1, 2, 3, 4, 5, 6, 7});
@@ -974,7 +952,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 
 // };
 
-
 // BOOST_AUTO_TEST_CASE(kl_total_comm_large_test_graphs) {
 //     std::vector<std::string> filenames_graph = large_spaa_graphs();
 //     using graph = computational_dag_edge_idx_vector_impl_def_int_t;
@@ -987,7 +964,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //         std::cout << cwd << std::endl;
 //     }
 
-   
 //     for (auto &filename_graph : filenames_graph) {
 //         GreedyBspScheduler<computational_dag_edge_idx_vector_impl_def_int_t> test_scheduler;
 //         BspInstance<graph> instance;
@@ -1003,7 +979,7 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //                                                    {4,4,0,1},
 //                                                    {4,4,1,0}};
 
-//         instance.getArchitecture().setSendCosts(send_cost);
+//         instance.getArchitecture().SetSendCosts(send_cost);
 
 //         if (!status_graph) {
 
@@ -1031,9 +1007,9 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //         auto start_time = std::chrono::high_resolution_clock::now();
 //         auto status = kl.improveSchedule(schedule);
 //         auto finish_time = std::chrono::high_resolution_clock::now();
-        
+
 //         auto duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
-        
+
 //         std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl;
 
 //         BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
@@ -1044,18 +1020,17 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //         // start_time = std::chrono::high_resolution_clock::now();
 //         // status = kl_old.improve_schedule_test_2(schedule_2);
 //         // finish_time = std::chrono::high_resolution_clock::now();
-        
+
 //         // duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
 
 //         // std::cout << "kl old finished in " << duration << " seconds, costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl;
-        
+
 //         // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
 //         // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true);
 
 //     }
 // }
 
-
 // BOOST_AUTO_TEST_CASE(kl_total_comm_large_test_graphs_mt) {
 //     std::vector<std::string> filenames_graph = large_spaa_graphs();
 //     using graph = computational_dag_edge_idx_vector_impl_def_int_t;
@@ -1068,7 +1043,6 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //         std::cout << cwd << std::endl;
 //     }
 
-   
 //     for (auto &filename_graph : filenames_graph) {
 //         GreedyBspScheduler<computational_dag_edge_idx_vector_impl_def_int_t> test_scheduler;
 //         BspInstance<graph> instance;
@@ -1084,7 +1058,7 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //                                                    {4,4,0,1},
 //                                                    {4,4,1,0}};
 
-//         instance.getArchitecture().setSendCosts(send_cost);
+//         instance.getArchitecture().SetSendCosts(send_cost);
 
 //         if (!status_graph) {
 
@@ -1112,9 +1086,9 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //         auto start_time = std::chrono::high_resolution_clock::now();
 //         auto status = kl.improveSchedule(schedule);
 //         auto finish_time = std::chrono::high_resolution_clock::now();
-        
+
 //         auto duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
-        
+
 //         std::cout << "kl new finished in " << duration << " seconds, costs: " << schedule.computeTotalCosts() << " with " << schedule.numberOfSupersteps() << " number of supersteps"<< std::endl;
 
 //         BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
@@ -1125,11 +1099,11 @@ BOOST_AUTO_TEST_CASE(kl_base_3) {
 //         // start_time = std::chrono::high_resolution_clock::now();
 //         // status = kl_old.improve_schedule_test_2(schedule_2);
 //         // finish_time = std::chrono::high_resolution_clock::now();
-        
+
 //         // duration = std::chrono::duration_cast<std::chrono::seconds>(finish_time - start_time).count();
 
 //         // std::cout << "kl old finished in " << duration << " seconds, costs: " << schedule_2.computeTotalCosts() << " with " << schedule_2.numberOfSupersteps() << " number of supersteps"<< std::endl;
-        
+
 //         // BOOST_CHECK(status == RETURN_STATUS::OSP_SUCCESS || status == RETURN_STATUS::BEST_FOUND);
 //         // BOOST_CHECK_EQUAL(schedule_2.satisfiesPrecedenceConstraints(), true);