From 614d8d6f9a48821764e8ed8fefa6e23811f7d20d Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 8 Oct 2025 17:23:08 +0200
Subject: [PATCH 1/6] Base version of VarianceSSP scheduler (needs revisions)

---
 .../GreedyVarianceSspScheduler.hpp            | 592 ++++++++++++++++++
 tests/bsp_schedulers.cpp                      |   7 +
 2 files changed, 599 insertions(+)
 create mode 100644 include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
new file mode 100644
index 00000000..d97bdaf3
--- /dev/null
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
@@ -0,0 +1,592 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include <algorithm>
+#include <chrono>
+#include <climits>
+#include <list>
+#include <map>
+#include <queue>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "MemoryConstraintModules.hpp"
+#include "osp/auxiliary/misc.hpp"
+#include "osp/bsp/model/BspSchedule.hpp"
+#include "osp/bsp/scheduler/Scheduler.hpp"
+#include "osp/graph_algorithms/directed_graph_top_sort.hpp"
+
+namespace osp {
+
+/**
+ * @brief The GreedyVarianceSspScheduler class represents a scheduler that uses a greedy algorithm
+ * with stale synchronous parallel (SSP) execution model.
+ *
+ * It computes schedules for BspInstance using variance-based priorities.
+ */
+template<typename Graph_t, typename MemoryConstraint_t = no_memory_constraint>
+class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
+
+    static_assert(is_computational_dag_v<Graph_t>, "GreedyVarianceSspScheduler can only be used with computational DAGs.");
+
+  private:
+    using VertexType = vertex_idx_t<Graph_t>;
+
+    constexpr static bool use_memory_constraint =
+        is_memory_constraint_v<MemoryConstraint_t> or is_memory_constraint_schedule_v<MemoryConstraint_t>;
+
+    static_assert(not use_memory_constraint or std::is_same_v<Graph_t, typename MemoryConstraint_t::Graph_impl_t>,
+                  "Graph_t must be the same as MemoryConstraint_t::Graph_impl_t.");
+
+    MemoryConstraint_t memory_constraint;
+    double max_percent_idle_processors;
+    bool increase_parallelism_in_new_superstep;
+    std::vector<v_memw_t<Graph_t>> current_proc_persistent_memory;
+    std::vector<v_commw_t<Graph_t>> current_proc_transient_memory;
+
+
+    std::vector<double> compute_work_variance(const Graph_t &graph) const {
+
+        std::vector<double> work_variance(graph.num_vertices(), 0.0);
+
+        const std::vector<VertexType> top_order = GetTopOrder(graph);
+
+        for (auto r_iter = top_order.rbegin(); r_iter != top_order.crend(); r_iter++) {
+            double temp = 0;
+            double max_priority = 0;
+            for (const auto &child : graph.children(*r_iter)) {
+                max_priority = std::max(work_variance[child], max_priority);
+            }
+            for (const auto &child : graph.children(*r_iter)) {
+                temp += std::exp(2 * (work_variance[child] - max_priority));
+            }
+            temp = std::log(temp) / 2 + max_priority;
+
+            double node_weight = std::log((double)std::max(graph.vertex_work_weight(*r_iter), static_cast<v_workw_t<Graph_t>>(1)));
+            double larger_val = node_weight > temp ? node_weight : temp;
+
+            work_variance[*r_iter] =
+                std::log(std::exp(node_weight - larger_val) + std::exp(temp - larger_val)) + larger_val;
+        }
+
+        return work_variance;
+    }
+
+    struct VarianceCompare {
+        bool operator()(const std::pair<VertexType, double> &lhs, const std::pair<VertexType, double> &rhs) const {
+            return ((lhs.second > rhs.second) || ((lhs.second == rhs.second) && (lhs.first < rhs.first)));
+        }
+    };
+
+    bool CanChooseNode(const BspInstance<Graph_t> &instance,
+                       const std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> &allReady,
+                       const std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> &procReady,
+                       const std::vector<bool> &procFree) const {
+        for (unsigned i = 0; i < instance.numberOfProcessors(); ++i)
+            if (procFree[i] && !procReady[i].empty())
+                return true;
+
+        for (unsigned i = 0; i < instance.numberOfProcessors(); ++i)
+            if (procFree[i] && !allReady[instance.getArchitecture().processorType(i)].empty())
+                return true;
+
+        return false;
+    }
+
+    void Choose(const BspInstance<Graph_t> &instance,
+            const std::vector<double> &work_variance,
+            std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> &allReady,
+            std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> &procReady,
+            const std::vector<bool> &procFree,
+            VertexType &node, unsigned &p,
+            const bool endSupStep,
+            const v_workw_t<Graph_t> remaining_time) const 
+    {
+        (void)work_variance; // silence -Werror=unused-parameter
+        double maxScore = -1;
+        bool found_allocation = false;
+        for (unsigned i = 0; i < instance.numberOfProcessors(); ++i) {
+            if (procFree[i] && !procReady[i].empty()) {
+                // select node
+                for (auto node_pair_it = procReady[i].begin(); node_pair_it != procReady[i].end();) {
+                    if (endSupStep &&
+                        (remaining_time < instance.getComputationalDag().vertex_work_weight(node_pair_it->first))) {
+                            node_pair_it = procReady[i].erase(node_pair_it);
+                            continue;
+                    }
+
+                    if (use_memory_constraint) {
+                        if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::LOCAL) {
+                            if (current_proc_persistent_memory[i] +
+                                    instance.getComputationalDag().vertex_mem_weight(node_pair_it->first) >
+                                instance.getArchitecture().memoryBound(i)) {
+
+                                node_pair_it = procReady[i].erase(node_pair_it);
+                                continue;
+                            }
+                        } else if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT) {
+                            if (current_proc_persistent_memory[i] +
+                                    instance.getComputationalDag().vertex_mem_weight(node_pair_it->first) +
+                                    std::max(current_proc_transient_memory[i],
+                                            instance.getComputationalDag().vertex_comm_weight(node_pair_it->first)) >
+                                instance.getArchitecture().memoryBound(i)) {
+
+                                node_pair_it = procReady[i].erase(node_pair_it);
+                                continue;
+                            }
+                        }
+                    }
+
+                    const double &score = node_pair_it->second;
+
+                    if (score > maxScore) {
+                        maxScore = score;
+                        node = node_pair_it->first;
+                        p = i;
+                        found_allocation = true;
+                        break;
+                    }
+                    node_pair_it++;
+                }
+            }
+        }
+
+        if (found_allocation)
+            return;
+        
+        for (unsigned i = 0; i < instance.numberOfProcessors(); ++i) {
+            if (procFree[i] && !allReady[instance.getArchitecture().processorType(i)].empty()) {
+                // select node
+                for (auto it = allReady[instance.getArchitecture().processorType(i)].begin(); it != allReady[instance.getArchitecture().processorType(i)].end();) {
+                    if (endSupStep &&
+                        (remaining_time < instance.getComputationalDag().vertex_work_weight(it->first))) {
+                        it = allReady[instance.getArchitecture().processorType(i)].erase(it);
+                        continue;
+                    }
+
+                    const double &score = it->second;
+
+                    if (score > maxScore) {
+                        if (use_memory_constraint) {
+
+                            if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::LOCAL) {
+
+                                if (current_proc_persistent_memory[i] +
+                                        instance.getComputationalDag().vertex_mem_weight(it->first) <=
+                                    instance.getArchitecture().memoryBound(i)) {
+
+                                    node = it->first;
+                                    p = i;
+                                    return;
+                                }
+
+                            } else if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT) {
+                                if (current_proc_persistent_memory[i] +
+                                        instance.getComputationalDag().vertex_mem_weight(it->first) +
+                                        std::max(current_proc_transient_memory[i],
+                                                instance.getComputationalDag().vertex_comm_weight(it->first)) <=
+                                    instance.getArchitecture().memoryBound(i)) {
+
+                                    node = it->first;
+                                    p = i;
+                                    return;
+                                }
+                            }
+
+                        } else {
+                            node = it->first;
+                            p = i;
+                            return;
+                        }
+                    }
+                    it++;
+                }
+
+            }
+        }
+    };
+
+    bool check_mem_feasibility(
+        const BspInstance<Graph_t> &instance,
+        const std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> &allReady,
+        const std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> &procReady) const 
+    {
+        if constexpr (use_memory_constraint) {
+            if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT) 
+            {
+                for (unsigned i = 0; i < instance.numberOfProcessors(); ++i) {
+                    if (!procReady[i].empty()) {
+
+                        const std::pair<VertexType, double> &node_pair = *procReady[i].begin();
+                        VertexType top_node = node_pair.first;
+
+                        if (memory_constraint.can_add(top_node, i)) {
+                            return true;
+                        }
+                    }
+                }
+
+                for (unsigned i = 0; i < instance.numberOfProcessors(); ++i) {
+
+                    if (allReady[instance.getArchitecture().processorType(i)].empty())
+                        continue;
+
+                    const std::pair<VertexType, double> &node_pair =
+                        *allReady[instance.getArchitecture().processorType(i)].begin();
+                    VertexType top_node = node_pair.first;
+
+                    if (memory_constraint.can_add(top_node, i)) {
+                        return true;
+                    }
+                }
+
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    unsigned get_nr_parallelizable_nodes(
+        const BspInstance<Graph_t> &instance,
+        const unsigned &stale,
+        const std::vector<unsigned> &nr_old_ready_nodes_per_type,
+        const std::vector<unsigned> &nr_ready_nodes_per_type,
+        const std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> &procReady,
+        const std::vector<unsigned> &nr_procs_per_type) const 
+    {
+        unsigned nr_nodes = 0;
+        unsigned num_proc_types = instance.getArchitecture().getNumberOfProcessorTypes();
+
+        std::vector<unsigned> procs_per_type = nr_procs_per_type;
+
+        if (stale > 1) {
+            for (unsigned proc = 0; proc < instance.numberOfProcessors(); proc++) {
+                if (!procReady[proc].empty()) {
+                    procs_per_type[instance.getArchitecture().processorType(proc)]--;
+                    nr_nodes++;
+                }
+            }
+        }
+
+        std::vector<unsigned> ready_nodes_per_type = nr_ready_nodes_per_type;
+        for (unsigned node_type = 0; node_type < ready_nodes_per_type.size(); node_type++) {
+            ready_nodes_per_type[node_type] += nr_old_ready_nodes_per_type[node_type];
+        }
+
+        for (unsigned proc_type = 0; proc_type < num_proc_types; ++proc_type) {
+            for (unsigned node_type = 0; node_type < instance.getComputationalDag().num_vertex_types(); ++node_type) {
+                if (instance.isCompatibleType(node_type, proc_type)) {
+                    unsigned matched = std::min(ready_nodes_per_type[node_type],
+                                                procs_per_type[proc_type]);
+                    nr_nodes += matched;
+                    ready_nodes_per_type[node_type] -= matched;
+                    procs_per_type[proc_type] -= matched;
+                }
+            }
+        }
+
+        return nr_nodes;
+    }
+
+
+    public:
+    virtual RETURN_STATUS computeSspSchedule(BspSchedule<Graph_t> &schedule, unsigned stale) {
+        const auto &instance = schedule.getInstance();
+        const auto N = instance.numberOfVertices();
+        const auto P = instance.numberOfProcessors();
+        const auto &G = instance.getComputationalDag();
+
+        if constexpr (use_memory_constraint) {
+            memory_constraint.initialize(schedule, 0); // SSP starts at supstepIdx = 0
+        }
+
+        // Reset processor assignments
+        for (auto v : G.vertices()) {
+            schedule.setAssignedProcessor(v, std::numeric_limits<unsigned>::max());
+        }
+
+        const std::vector<double> work_variances = compute_work_variance(G);
+
+        std::set<std::pair<VertexType, double>, VarianceCompare> old_ready;
+        std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> ready(stale);
+        std::vector<std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>>> procReady(
+            stale, std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>>(P));
+        std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> allReady(
+            instance.getArchitecture().getNumberOfProcessorTypes());
+
+        const auto procTypesCompatibleWithNodeType = instance.getProcTypesCompatibleWithNodeType();
+
+        std::vector<unsigned> nr_old_ready_nodes_per_type(G.num_vertex_types(), 0);
+        std::vector<std::vector<unsigned>> nr_ready_stale_nodes_per_type(
+            stale, std::vector<unsigned>(G.num_vertex_types(), 0));
+        std::vector<unsigned> nr_procs_per_type(instance.getArchitecture().getNumberOfProcessorTypes(), 0);
+        for (auto proc = 0u; proc < P; ++proc) {
+            ++nr_procs_per_type[instance.getArchitecture().processorType(proc)];
+        }
+
+        std::vector<unsigned> nrPredecRemain(N);
+        for (VertexType node = 0; node < static_cast<VertexType>(N); ++node) {
+            const auto num_parents = G.in_degree(node);
+            nrPredecRemain[node] = static_cast<unsigned>(num_parents);
+            if (num_parents == 0) {
+                ready[0].insert(std::make_pair(node, work_variances[node]));
+                nr_ready_stale_nodes_per_type[0][G.vertex_type(node)]++;
+            }
+        }
+
+        std::vector<bool> procFree(P, true);
+        unsigned free = P;
+
+        std::set<std::pair<v_workw_t<Graph_t>, VertexType>> finishTimes;
+        finishTimes.emplace(0, std::numeric_limits<VertexType>::max());
+
+        std::vector<unsigned> number_of_allocated_allReady_tasks_in_superstep(instance.getArchitecture().getNumberOfProcessorTypes(), 0);
+        std::vector<unsigned> limit_of_number_of_allocated_allReady_tasks_in_superstep(instance.getArchitecture().getNumberOfProcessorTypes(), 0);
+
+        unsigned supstepIdx = 0u;
+        bool endSupStep = true;
+        bool begin_outer_while = true;
+        bool able_to_schedule_in_step = false;
+        unsigned successive_empty_supersteps = 0u;
+
+        auto nonempty_ready = [&]() {
+            return std::any_of(ready.cbegin(), ready.cend(),
+                             [](const std::set<std::pair<VertexType, double>, VarianceCompare>& ready_set) { return !ready_set.empty(); });
+        };
+
+        while (!old_ready.empty() || nonempty_ready() || !finishTimes.empty()) {
+            if (finishTimes.empty() && endSupStep) {
+                able_to_schedule_in_step = false;
+                number_of_allocated_allReady_tasks_in_superstep = std::vector<unsigned>(instance.getArchitecture().getNumberOfProcessorTypes(), 0);
+
+                for (unsigned i = 0; i < P; ++i)
+                    procReady[supstepIdx % stale][i].clear();
+
+                if (!begin_outer_while) {
+                    supstepIdx++;
+                } else {
+                    begin_outer_while = false;
+                }
+
+                 for (unsigned procType = 0; procType < instance.getArchitecture().getNumberOfProcessorTypes(); ++procType)
+                    allReady[procType].clear();
+
+                old_ready.insert(ready[supstepIdx % stale].begin(), ready[supstepIdx % stale].end());
+                ready[supstepIdx % stale].clear();
+                for (unsigned node_type = 0; node_type < G.num_vertex_types(); ++node_type) {
+                    nr_old_ready_nodes_per_type[node_type] += nr_ready_stale_nodes_per_type[supstepIdx % stale][node_type];
+                    nr_ready_stale_nodes_per_type[supstepIdx % stale][node_type] = 0;
+                }
+
+                for (const auto &nodeAndValuePair : old_ready) {
+                    VertexType node = nodeAndValuePair.first;
+                    for (unsigned procType : procTypesCompatibleWithNodeType[G.vertex_type(node)]) {
+                        allReady[procType].insert(allReady[procType].end(), nodeAndValuePair);
+                    }
+                }
+
+                if constexpr (use_memory_constraint) {
+                    if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::LOCAL) {
+                        for (unsigned proc = 0; proc < P; proc++)
+                            memory_constraint.reset(proc);
+                    }
+                }
+
+                for (unsigned procType = 0; procType < instance.getArchitecture().getNumberOfProcessorTypes(); procType++) {
+                    unsigned equal_split = (static_cast<unsigned int>(allReady[procType].size()) + stale - 1) / stale;
+                    unsigned at_least_for_long_step = 3 * nr_procs_per_type[procType];
+
+                    limit_of_number_of_allocated_allReady_tasks_in_superstep[procType] = std::max(at_least_for_long_step, equal_split);
+                }
+
+                endSupStep = false;
+                finishTimes.emplace(0, std::numeric_limits<VertexType>::max());
+            }
+
+            const v_workw_t<Graph_t> time = finishTimes.begin()->first;
+            const v_workw_t<Graph_t> max_finish_time = finishTimes.rbegin()->first;
+
+            // Find new ready jobs
+            while (!finishTimes.empty() && finishTimes.begin()->first == time) {
+                const VertexType node = finishTimes.begin()->second;
+                finishTimes.erase(finishTimes.begin());
+                if (node != std::numeric_limits<VertexType>::max()) {
+                    for (const auto &succ : G.children(node)) {
+                        nrPredecRemain[succ]--;
+                        if (nrPredecRemain[succ] == 0) {
+                            ready[supstepIdx % stale].emplace(succ, work_variances[succ]);
+                            nr_ready_stale_nodes_per_type[supstepIdx % stale][G.vertex_type(succ)]++;
+
+                            bool canAdd = instance.isCompatible(succ, schedule.assignedProcessor(node));
+                            unsigned earliest_add = supstepIdx;
+                            bool memory_ok = true;
+
+                            for (const auto &pred : G.parents(succ)) {
+                                if (schedule.assignedProcessor(pred) != schedule.assignedProcessor(node))
+                                    earliest_add = std::max(earliest_add, stale + schedule.assignedSuperstep(pred));
+                            }
+
+                            if (use_memory_constraint && canAdd && (earliest_add == supstepIdx)) {
+
+                                if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::LOCAL) {
+
+                                    if (current_proc_persistent_memory[schedule.assignedProcessor(node)] +
+                                            instance.getComputationalDag().vertex_mem_weight(succ) >
+                                        instance.getArchitecture().memoryBound(schedule.assignedProcessor(node))) {
+                                        memory_ok = false;
+                                    }
+
+                                } else if (instance.getArchitecture().getMemoryConstraintType() ==
+                                        MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT) {
+
+                                    if (current_proc_persistent_memory[schedule.assignedProcessor(node)] +
+                                            instance.getComputationalDag().vertex_mem_weight(succ) +
+                                            std::max(current_proc_transient_memory[schedule.assignedProcessor(node)],
+                                                    instance.getComputationalDag().vertex_comm_weight(succ)) >
+                                        instance.getArchitecture().memoryBound(schedule.assignedProcessor(node))) {
+                                        memory_ok = false;
+                                    }
+                                }
+                            }
+
+                            if (canAdd) {
+                                for (unsigned step_to_add = earliest_add; step_to_add < supstepIdx + stale; step_to_add++) {
+                                    if ((step_to_add == supstepIdx) && !memory_ok) {
+                                        continue;
+                                    }
+                                    procReady[step_to_add % stale][schedule.assignedProcessor(node)].emplace(succ, work_variances[succ]);
+                                }
+                            }
+                        }
+                    }
+
+                    procFree[schedule.assignedProcessor(node)] = true;
+                    ++free;
+                }
+            }
+
+            // Assign new jobs
+            if (!CanChooseNode(instance, allReady, procReady[supstepIdx % stale], procFree)) {
+                endSupStep = true;
+            }
+            while (CanChooseNode(instance, allReady, procReady[supstepIdx % stale], procFree)) {
+                std::cout << "10" << std::endl;
+                VertexType nextNode = std::numeric_limits<VertexType>::max();
+                unsigned nextProc = P;
+
+                Choose( instance, work_variances, allReady, 
+                        procReady[supstepIdx % stale], procFree, 
+                        nextNode, nextProc, endSupStep, max_finish_time - time);
+
+                if (nextNode == std::numeric_limits<VertexType>::max() || nextProc == P) {
+                    endSupStep = true;
+                    break;
+                }
+
+                if (procReady[supstepIdx % stale][nextProc].find(std::make_pair(nextNode, work_variances[nextNode])) !=
+                    procReady[supstepIdx % stale][nextProc].end()) {
+                    for (size_t i = 0; i < stale; i++) {
+                        procReady[i][nextProc].erase(std::make_pair(nextNode, work_variances[nextNode]));
+                    }
+                } else {
+                    for(unsigned procType : procTypesCompatibleWithNodeType[G.vertex_type(nextNode)]) {
+                        allReady[procType].erase(std::make_pair(nextNode, work_variances[nextNode]));
+                    }
+                    nr_old_ready_nodes_per_type[G.vertex_type(nextNode)]--;
+                    const unsigned nextProcType = instance.getArchitecture().processorType(nextProc);
+                    number_of_allocated_allReady_tasks_in_superstep[nextProcType]++;
+                    if (number_of_allocated_allReady_tasks_in_superstep[nextProcType] >= limit_of_number_of_allocated_allReady_tasks_in_superstep[nextProcType]) {
+                        allReady[nextProcType].clear();
+                    }
+                }
+
+                for (size_t i = 0; i < stale; i++) {
+                    ready[i].erase(std::make_pair(nextNode, work_variances[nextNode]));
+                }
+                old_ready.erase(std::make_pair(nextNode, work_variances[nextNode]));
+
+                schedule.setAssignedProcessor(nextNode, nextProc);
+                schedule.setAssignedSuperstep(nextNode, supstepIdx);
+                able_to_schedule_in_step = true;
+
+                if constexpr (use_memory_constraint) {
+                    memory_constraint.add(nextNode, nextProc);
+
+                    std::vector<std::pair<VertexType, double>> toErase;
+                    for (const auto &node_pair : procReady[supstepIdx % stale][nextProc]) {
+                        if (!memory_constraint.can_add(node_pair.first, nextProc)) {
+                            toErase.push_back(node_pair);
+                        }
+                    }
+                    for (const auto &n : toErase) {
+                        procReady[supstepIdx % stale][nextProc].erase(n);
+                    }
+                }
+
+                finishTimes.emplace(time + G.vertex_work_weight(nextNode), nextNode);
+                procFree[nextProc] = false;
+                --free;
+            }
+
+            if (able_to_schedule_in_step)
+                successive_empty_supersteps = 0;
+            else if (++successive_empty_supersteps > 100u + stale)
+                return RETURN_STATUS::ERROR;
+
+            if (free > static_cast<decltype(free)>(P * max_percent_idle_processors) &&
+                ((!increase_parallelism_in_new_superstep) ||
+                get_nr_parallelizable_nodes(instance, stale,
+                                            nr_old_ready_nodes_per_type,
+                                            nr_ready_stale_nodes_per_type[(supstepIdx + 1) % stale],
+                                            procReady[(supstepIdx + 1) % stale],
+                                            nr_procs_per_type) >=
+                    std::min(std::min(P, static_cast<unsigned>(1.2 * (P - free))),
+                            P - free + static_cast<unsigned>(0.5 * free)))) {
+                endSupStep = true;
+            }
+        }
+
+        assert(schedule.satisfiesPrecedenceConstraints());
+        //schedule.setAutoCommunicationSchedule();
+
+        return RETURN_STATUS::OSP_SUCCESS;
+    }
+
+
+
+    virtual RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) override {
+        // Default SSP = 1 (no staleness) or maybe configurable later
+        return computeSspSchedule(schedule, /*stale=*/1);
+    }
+
+    virtual std::string getScheduleName() const override {
+        if constexpr (use_memory_constraint) {
+            return "GreedyVarianceSspMemory";
+        } else {
+            return "GreedyVarianceSsp";
+        }
+    }
+
+};
+
+} // namespace osp
diff --git a/tests/bsp_schedulers.cpp b/tests/bsp_schedulers.cpp
index 936bad69..94a9161b 100644
--- a/tests/bsp_schedulers.cpp
+++ b/tests/bsp_schedulers.cpp
@@ -33,6 +33,7 @@ limitations under the License.
 #include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/VarianceFillup.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp"
 #include "osp/bsp/scheduler/LoadBalanceScheduler/LightEdgeVariancePartitioner.hpp"
 #include "osp/bsp/scheduler/LoadBalanceScheduler/VariancePartitioner.hpp"
 #include "osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp"
@@ -209,6 +210,12 @@ BOOST_AUTO_TEST_CASE(variancefillup_test) {
     run_test(&test);
 }
 
+BOOST_AUTO_TEST_CASE(greedyvariancesspscheduler_test){
+    GreedyVarianceSspScheduler<computational_dag_vector_impl_def_t> test;
+    run_test(&test);
+}
+
+
 BOOST_AUTO_TEST_CASE(etf_test_edge_desc_impl) {
 
     EtfScheduler<computational_dag_edge_idx_vector_impl_def_t> test;

From 07367391d91d58b2731e50074ee4d4f87c7a6156 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Thu, 9 Oct 2025 15:59:18 +0200
Subject: [PATCH 2/6] GreedyVarianceSspScheduler: adding MaxBspSchedule
 (staleness=2) support and tests

---
 include/osp/bsp/model/BspSchedule.hpp         |   2 +-
 .../GreedyVarianceSspScheduler.hpp            | 320 ++++++++++--------
 include/osp/bsp/scheduler/MaxBspScheduler.hpp |  87 +++++
 tests/bsp_schedulers.cpp                      |  38 ++-
 4 files changed, 307 insertions(+), 140 deletions(-)
 create mode 100644 include/osp/bsp/scheduler/MaxBspScheduler.hpp

diff --git a/include/osp/bsp/model/BspSchedule.hpp b/include/osp/bsp/model/BspSchedule.hpp
index 299e1716..b30cb26c 100644
--- a/include/osp/bsp/model/BspSchedule.hpp
+++ b/include/osp/bsp/model/BspSchedule.hpp
@@ -65,7 +65,7 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
     unsigned number_of_supersteps;
 
     std::vector<unsigned> node_to_processor_assignment;
-    std::vector<unsigned> node_to_superstep_assignment;
+    std::vector<unsigned> node_to_superstep_assignment; 
 
     template<unsigned staleness>
     inline bool satisfies_precedence_constraints_staleness() const {
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
index d97bdaf3..2a8b8d99 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
@@ -32,6 +32,7 @@ limitations under the License.
 #include "MemoryConstraintModules.hpp"
 #include "osp/auxiliary/misc.hpp"
 #include "osp/bsp/model/BspSchedule.hpp"
+#include "osp/bsp/model/MaxBspSchedule.hpp"
 #include "osp/bsp/scheduler/Scheduler.hpp"
 #include "osp/graph_algorithms/directed_graph_top_sort.hpp"
 
@@ -60,14 +61,9 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
     MemoryConstraint_t memory_constraint;
     double max_percent_idle_processors;
     bool increase_parallelism_in_new_superstep;
-    std::vector<v_memw_t<Graph_t>> current_proc_persistent_memory;
-    std::vector<v_commw_t<Graph_t>> current_proc_transient_memory;
-
 
     std::vector<double> compute_work_variance(const Graph_t &graph) const {
-
         std::vector<double> work_variance(graph.num_vertices(), 0.0);
-
         const std::vector<VertexType> top_order = GetTopOrder(graph);
 
         for (auto r_iter = top_order.rbegin(); r_iter != top_order.crend(); r_iter++) {
@@ -91,6 +87,28 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
         return work_variance;
     }
 
+    std::vector<std::vector<std::vector<unsigned>>>
+    procTypesCompatibleWithNodeType_omit_procType(const BspInstance<Graph_t> &instance) const {
+
+        const std::vector<std::vector<unsigned>> procTypesCompatibleWithNodeType =
+            instance.getProcTypesCompatibleWithNodeType();
+
+        std::vector<std::vector<std::vector<unsigned>>> procTypesCompatibleWithNodeType_skip(
+            instance.getArchitecture().getNumberOfProcessorTypes(),
+            std::vector<std::vector<unsigned>>(instance.getComputationalDag().num_vertex_types()));
+        for (unsigned procType = 0; procType < instance.getArchitecture().getNumberOfProcessorTypes(); procType++) {
+            for (unsigned nodeType = 0; nodeType < instance.getComputationalDag().num_vertex_types(); nodeType++) {
+                for (unsigned otherProcType : procTypesCompatibleWithNodeType[nodeType]) {
+                    if (procType == otherProcType)
+                        continue;
+                    procTypesCompatibleWithNodeType_skip[procType][nodeType].emplace_back(otherProcType);
+                }
+            }
+        }
+
+        return procTypesCompatibleWithNodeType_skip;
+    }
+
     struct VarianceCompare {
         bool operator()(const std::pair<VertexType, double> &lhs, const std::pair<VertexType, double> &rhs) const {
             return ((lhs.second > rhs.second) || ((lhs.second == rhs.second) && (lhs.first < rhs.first)));
@@ -119,112 +137,150 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
             const std::vector<bool> &procFree,
             VertexType &node, unsigned &p,
             const bool endSupStep,
-            const v_workw_t<Graph_t> remaining_time) const 
+            const v_workw_t<Graph_t> remaining_time,
+            const std::vector<std::vector<std::vector<unsigned>>> &procTypesCompatibleWithNodeType_skip_proctype) const 
     {
-        (void)work_variance; // silence -Werror=unused-parameter
         double maxScore = -1;
         bool found_allocation = false;
+
         for (unsigned i = 0; i < instance.numberOfProcessors(); ++i) {
-            if (procFree[i] && !procReady[i].empty()) {
-                // select node
-                for (auto node_pair_it = procReady[i].begin(); node_pair_it != procReady[i].end();) {
-                    if (endSupStep &&
-                        (remaining_time < instance.getComputationalDag().vertex_work_weight(node_pair_it->first))) {
-                            node_pair_it = procReady[i].erase(node_pair_it);
-                            continue;
-                    }
+            if (!procFree[i] || procReady[i].empty())
+                continue;
+
+            auto it = procReady[i].begin();
+            while (it != procReady[i].end()) {
+                if (endSupStep &&
+                    (remaining_time < instance.getComputationalDag().vertex_work_weight(it->first))) {
+                    it = procReady[i].erase(it);
+                    continue;
+                }
 
-                    if (use_memory_constraint) {
-                        if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::LOCAL) {
-                            if (current_proc_persistent_memory[i] +
-                                    instance.getComputationalDag().vertex_mem_weight(node_pair_it->first) >
-                                instance.getArchitecture().memoryBound(i)) {
+                const double &score = it->second;
 
-                                node_pair_it = procReady[i].erase(node_pair_it);
-                                continue;
-                            }
-                        } else if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT) {
-                            if (current_proc_persistent_memory[i] +
-                                    instance.getComputationalDag().vertex_mem_weight(node_pair_it->first) +
-                                    std::max(current_proc_transient_memory[i],
-                                            instance.getComputationalDag().vertex_comm_weight(node_pair_it->first)) >
-                                instance.getArchitecture().memoryBound(i)) {
-
-                                node_pair_it = procReady[i].erase(node_pair_it);
-                                continue;
-                            }
-                        }
-                    }
+                if (score > maxScore) {
+                    const unsigned procType = instance.getArchitecture().processorType(i);
 
-                    const double &score = node_pair_it->second;
+                    if constexpr (use_memory_constraint) {
+                        if (memory_constraint.can_add(it->first, i)) {
+                            node = it->first;
+                            p = i;
+                            found_allocation = true;
+
+                            procReady[i].erase(it);
+
+                            if (procType < procTypesCompatibleWithNodeType_skip_proctype.size()) {
+                                const auto &compatibleTypes =
+                                    procTypesCompatibleWithNodeType_skip_proctype[procType]
+                                        [instance.getComputationalDag().vertex_type(node)];
+
+                                for (unsigned otherType : compatibleTypes) {
+                                    for (unsigned j = 0; j < instance.numberOfProcessors(); ++j) {
+                                        if (j != i &&
+                                            instance.getArchitecture().processorType(j) == otherType &&
+                                            j < procReady.size()) {
+                                            procReady[j].erase(std::make_pair(node, work_variance[node]));
+                                        }
+                                    }
+                                }
+                            }
 
-                    if (score > maxScore) {
-                        maxScore = score;
-                        node = node_pair_it->first;
+                            return;
+                        }
+                    } else {
+                        node = it->first;
                         p = i;
                         found_allocation = true;
-                        break;
+
+                        procReady[i].erase(it);
+
+                        if (procType < procTypesCompatibleWithNodeType_skip_proctype.size()) {
+                            const auto &compatibleTypes =
+                                procTypesCompatibleWithNodeType_skip_proctype[procType]
+                                    [instance.getComputationalDag().vertex_type(node)];
+
+                            for (unsigned otherType : compatibleTypes) {
+                                for (unsigned j = 0; j < instance.numberOfProcessors(); ++j) {
+                                    if (j != i &&
+                                        instance.getArchitecture().processorType(j) == otherType &&
+                                        j < procReady.size()) {
+                                        procReady[j].erase(std::make_pair(node, work_variance[node]));
+                                    }
+                                }
+                            }
+                        }
+
+                        return;
                     }
-                    node_pair_it++;
                 }
+
+                ++it;
             }
         }
 
         if (found_allocation)
             return;
-        
-        for (unsigned i = 0; i < instance.numberOfProcessors(); ++i) {
-            if (procFree[i] && !allReady[instance.getArchitecture().processorType(i)].empty()) {
-                // select node
-                for (auto it = allReady[instance.getArchitecture().processorType(i)].begin(); it != allReady[instance.getArchitecture().processorType(i)].end();) {
-                    if (endSupStep &&
-                        (remaining_time < instance.getComputationalDag().vertex_work_weight(it->first))) {
-                        it = allReady[instance.getArchitecture().processorType(i)].erase(it);
-                        continue;
-                    }
-
-                    const double &score = it->second;
 
-                    if (score > maxScore) {
-                        if (use_memory_constraint) {
+        for (unsigned i = 0; i < instance.numberOfProcessors(); ++i) {
+            const unsigned procType = instance.getArchitecture().processorType(i);
+            if (!procFree[i] || procType >= allReady.size() || allReady[procType].empty())
+                continue;
+
+            auto &readyList = allReady[procType];
+            auto it = readyList.begin();
+
+            while (it != readyList.end()) {
+                if (endSupStep &&
+                    (remaining_time < instance.getComputationalDag().vertex_work_weight(it->first))) {
+                    it = readyList.erase(it);
+                    continue;
+                }
 
-                            if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::LOCAL) {
+                const double &score = it->second;
 
-                                if (current_proc_persistent_memory[i] +
-                                        instance.getComputationalDag().vertex_mem_weight(it->first) <=
-                                    instance.getArchitecture().memoryBound(i)) {
+                if (score > maxScore) {
+                    if constexpr (use_memory_constraint) {
+                        if (memory_constraint.can_add(it->first, i)) {
+                            node = it->first;
+                            p = i;
 
-                                    node = it->first;
-                                    p = i;
-                                    return;
-                                }
+                            readyList.erase(it);
 
-                            } else if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT) {
-                                if (current_proc_persistent_memory[i] +
-                                        instance.getComputationalDag().vertex_mem_weight(it->first) +
-                                        std::max(current_proc_transient_memory[i],
-                                                instance.getComputationalDag().vertex_comm_weight(it->first)) <=
-                                    instance.getArchitecture().memoryBound(i)) {
+                            const auto &compatibleTypes =
+                                procTypesCompatibleWithNodeType_skip_proctype[procType]
+                                    [instance.getComputationalDag().vertex_type(node)];
 
-                                    node = it->first;
-                                    p = i;
-                                    return;
-                                }
+                            for (unsigned otherType : compatibleTypes) {
+                                if (otherType < allReady.size())
+                                    allReady[otherType].erase(std::make_pair(node, work_variance[node]));
                             }
 
-                        } else {
-                            node = it->first;
-                            p = i;
                             return;
                         }
+                    } else {
+                        node = it->first;
+                        p = i;
+
+                        readyList.erase(it);
+
+                        const auto &compatibleTypes =
+                            procTypesCompatibleWithNodeType_skip_proctype[procType]
+                                [instance.getComputationalDag().vertex_type(node)];
+
+                        for (unsigned otherType : compatibleTypes) {
+                            if (otherType < allReady.size())
+                                allReady[otherType].erase(std::make_pair(node, work_variance[node]));
+                        }
+
+                        return;
                     }
-                    it++;
                 }
 
+                ++it;
             }
         }
     };
 
+
     bool check_mem_feasibility(
         const BspInstance<Graph_t> &instance,
         const std::vector<std::set<std::pair<VertexType, double>, VarianceCompare>> &allReady,
@@ -310,20 +366,25 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
 
 
     public:
-    virtual RETURN_STATUS computeSspSchedule(BspSchedule<Graph_t> &schedule, unsigned stale) {
+
+    RETURN_STATUS computeSspSchedule(BspSchedule<Graph_t> &schedule, unsigned stale) {
+
         const auto &instance = schedule.getInstance();
-        const auto N = instance.numberOfVertices();
-        const auto P = instance.numberOfProcessors();
         const auto &G = instance.getComputationalDag();
+        const auto &N = instance.numberOfVertices();
+        const unsigned &P = instance.numberOfProcessors();
 
-        if constexpr (use_memory_constraint) {
-            memory_constraint.initialize(schedule, 0); // SSP starts at supstepIdx = 0
-        }
-
-        // Reset processor assignments
         for (auto v : G.vertices()) {
             schedule.setAssignedProcessor(v, std::numeric_limits<unsigned>::max());
         }
+        
+        unsigned supstepIdx = 0;
+
+        if constexpr (is_memory_constraint_v<MemoryConstraint_t>) {
+            memory_constraint.initialize(instance);
+        } else if constexpr (is_memory_constraint_schedule_v<MemoryConstraint_t>) {
+            memory_constraint.initialize(schedule, supstepIdx);
+        }
 
         const std::vector<double> work_variances = compute_work_variance(G);
 
@@ -335,6 +396,8 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
             instance.getArchitecture().getNumberOfProcessorTypes());
 
         const auto procTypesCompatibleWithNodeType = instance.getProcTypesCompatibleWithNodeType();
+        const std::vector<std::vector<std::vector<unsigned>>> procTypesCompatibleWithNodeType_skip_proctype =
+            procTypesCompatibleWithNodeType_omit_procType(instance);
 
         std::vector<unsigned> nr_old_ready_nodes_per_type(G.num_vertex_types(), 0);
         std::vector<std::vector<unsigned>> nr_ready_stale_nodes_per_type(
@@ -363,7 +426,9 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
         std::vector<unsigned> number_of_allocated_allReady_tasks_in_superstep(instance.getArchitecture().getNumberOfProcessorTypes(), 0);
         std::vector<unsigned> limit_of_number_of_allocated_allReady_tasks_in_superstep(instance.getArchitecture().getNumberOfProcessorTypes(), 0);
 
-        unsigned supstepIdx = 0u;
+        
+
+
         bool endSupStep = true;
         bool begin_outer_while = true;
         bool able_to_schedule_in_step = false;
@@ -430,57 +495,45 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
             while (!finishTimes.empty() && finishTimes.begin()->first == time) {
                 const VertexType node = finishTimes.begin()->second;
                 finishTimes.erase(finishTimes.begin());
+
                 if (node != std::numeric_limits<VertexType>::max()) {
-                    for (const auto &succ : G.children(node)) {
+                    const unsigned proc_of_node = schedule.assignedProcessor(node);
+
+                    for (const auto& succ : G.children(node)) {
                         nrPredecRemain[succ]--;
                         if (nrPredecRemain[succ] == 0) {
                             ready[supstepIdx % stale].emplace(succ, work_variances[succ]);
                             nr_ready_stale_nodes_per_type[supstepIdx % stale][G.vertex_type(succ)]++;
 
-                            bool canAdd = instance.isCompatible(succ, schedule.assignedProcessor(node));
                             unsigned earliest_add = supstepIdx;
-                            bool memory_ok = true;
-
-                            for (const auto &pred : G.parents(succ)) {
-                                if (schedule.assignedProcessor(pred) != schedule.assignedProcessor(node))
-                                    earliest_add = std::max(earliest_add, stale + schedule.assignedSuperstep(pred));
+                            for (const auto& pred : G.parents(succ)) {
+                                if (schedule.assignedProcessor(pred) != proc_of_node) {
+                                    earliest_add = std::max(earliest_add,
+                                                            stale + schedule.assignedSuperstep(pred));
+                                }
                             }
 
-                            if (use_memory_constraint && canAdd && (earliest_add == supstepIdx)) {
+                            if (instance.isCompatible(succ, proc_of_node)) {
+                                bool memory_ok = true;
 
-                                if (instance.getArchitecture().getMemoryConstraintType() == MEMORY_CONSTRAINT_TYPE::LOCAL) {
-
-                                    if (current_proc_persistent_memory[schedule.assignedProcessor(node)] +
-                                            instance.getComputationalDag().vertex_mem_weight(succ) >
-                                        instance.getArchitecture().memoryBound(schedule.assignedProcessor(node))) {
-                                        memory_ok = false;
-                                    }
-
-                                } else if (instance.getArchitecture().getMemoryConstraintType() ==
-                                        MEMORY_CONSTRAINT_TYPE::PERSISTENT_AND_TRANSIENT) {
-
-                                    if (current_proc_persistent_memory[schedule.assignedProcessor(node)] +
-                                            instance.getComputationalDag().vertex_mem_weight(succ) +
-                                            std::max(current_proc_transient_memory[schedule.assignedProcessor(node)],
-                                                    instance.getComputationalDag().vertex_comm_weight(succ)) >
-                                        instance.getArchitecture().memoryBound(schedule.assignedProcessor(node))) {
-                                        memory_ok = false;
+                                if constexpr (use_memory_constraint) {
+                                    if (earliest_add == supstepIdx) {
+                                        memory_ok = memory_constraint.can_add(succ, proc_of_node);
                                     }
                                 }
-                            }
-
-                            if (canAdd) {
-                                for (unsigned step_to_add = earliest_add; step_to_add < supstepIdx + stale; step_to_add++) {
+                                for (unsigned step_to_add = earliest_add;
+                                    step_to_add < supstepIdx + stale; ++step_to_add) {
                                     if ((step_to_add == supstepIdx) && !memory_ok) {
-                                        continue;
+                                        continue; 
                                     }
-                                    procReady[step_to_add % stale][schedule.assignedProcessor(node)].emplace(succ, work_variances[succ]);
+                                    procReady[step_to_add % stale][proc_of_node].emplace(
+                                        succ, work_variances[succ]);
                                 }
                             }
                         }
                     }
 
-                    procFree[schedule.assignedProcessor(node)] = true;
+                    procFree[proc_of_node] = true;
                     ++free;
                 }
             }
@@ -490,13 +543,12 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                 endSupStep = true;
             }
             while (CanChooseNode(instance, allReady, procReady[supstepIdx % stale], procFree)) {
-                std::cout << "10" << std::endl;
                 VertexType nextNode = std::numeric_limits<VertexType>::max();
                 unsigned nextProc = P;
 
                 Choose( instance, work_variances, allReady, 
                         procReady[supstepIdx % stale], procFree, 
-                        nextNode, nextProc, endSupStep, max_finish_time - time);
+                        nextNode, nextProc, endSupStep, max_finish_time - time, procTypesCompatibleWithNodeType_skip_proctype);
 
                 if (nextNode == std::numeric_limits<VertexType>::max() || nextProc == P) {
                     endSupStep = true;
@@ -550,18 +602,19 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
 
             if (able_to_schedule_in_step)
                 successive_empty_supersteps = 0;
-            else if (++successive_empty_supersteps > 100u + stale)
+            else if (++successive_empty_supersteps > 100 + stale)
                 return RETURN_STATUS::ERROR;
 
             if (free > static_cast<decltype(free)>(P * max_percent_idle_processors) &&
                 ((!increase_parallelism_in_new_superstep) ||
-                get_nr_parallelizable_nodes(instance, stale,
-                                            nr_old_ready_nodes_per_type,
-                                            nr_ready_stale_nodes_per_type[(supstepIdx + 1) % stale],
-                                            procReady[(supstepIdx + 1) % stale],
-                                            nr_procs_per_type) >=
-                    std::min(std::min(P, static_cast<unsigned>(1.2 * (P - free))),
-                            P - free + static_cast<unsigned>(0.5 * free)))) {
+                get_nr_parallelizable_nodes(
+                    instance, stale, nr_old_ready_nodes_per_type,
+                    nr_ready_stale_nodes_per_type[(supstepIdx + 1) % stale],
+                    procReady[(supstepIdx + 1) % stale],
+                    nr_procs_per_type) >= std::min(
+                                            std::min(P, static_cast<unsigned>(1.2 * (P - free))),
+                                            P - free + static_cast<unsigned>(0.5 * free)))) 
+            {
                 endSupStep = true;
             }
         }
@@ -572,14 +625,15 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
         return RETURN_STATUS::OSP_SUCCESS;
     }
 
+    RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) override {
+        return computeSspSchedule(schedule, 1U);
+    }
 
-
-    virtual RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) override {
-        // Default SSP = 1 (no staleness) or maybe configurable later
-        return computeSspSchedule(schedule, /*stale=*/1);
+    RETURN_STATUS computeSchedule(MaxBspSchedule<Graph_t> &schedule) {
+        return computeSspSchedule(schedule, 2U);
     }
 
-    virtual std::string getScheduleName() const override {
+    std::string getScheduleName() const override {
         if constexpr (use_memory_constraint) {
             return "GreedyVarianceSspMemory";
         } else {
diff --git a/include/osp/bsp/scheduler/MaxBspScheduler.hpp b/include/osp/bsp/scheduler/MaxBspScheduler.hpp
new file mode 100644
index 00000000..76f525f7
--- /dev/null
+++ b/include/osp/bsp/scheduler/MaxBspScheduler.hpp
@@ -0,0 +1,87 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Christos Matzoros, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#pragma once
+
+#include "osp/bsp/model/MaxBspSchedule.hpp"
+#include "osp/bsp/model/MaxBspScheduleCS.hpp"
+#include "osp/bsp/scheduler/Scheduler.hpp"
+
+namespace osp {
+
+/**
+ * @class Scheduler
+ * @brief Abstract base class for scheduling scheduler.
+ *
+ * The Scheduler class provides a common interface for scheduling scheduler in the BSP scheduling system.
+ * It defines methods for setting and getting the time limit, as well as computing schedules.
+ */
+template<typename Graph_t>
+class MaxBspScheduler : public Scheduler<Graph_t> {
+
+    static_assert(is_computational_dag_v<Graph_t>, "BspSchedule can only be used with computational DAGs.");
+
+    /**
+     * @brief Get the name of the scheduling algorithm.
+     * @return The name of the scheduling algorithm.
+     */
+    virtual std::string getScheduleName() const override = 0;
+
+    /**
+     * @brief Compute a BSP schedule for the given BSP instance.
+     * @param instance The BSP instance for which to compute the schedule.
+     * @return A pair containing the return status and the computed schedule.
+     */
+    virtual RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) override {
+        MaxBspSchedule<Graph_t> tmpSched(schedule.getInstance());
+        RETURN_STATUS status = computeSchedule(tmpSched);
+        schedule = tmpSched;
+        return status;
+    }
+
+    virtual RETURN_STATUS computeScheduleCS(BspScheduleCS<Graph_t> &schedule) {
+
+        auto result = computeSchedule(schedule);
+        if (result == RETURN_STATUS::OSP_SUCCESS || result == RETURN_STATUS::BEST_FOUND) {
+            schedule.setAutoCommunicationSchedule();
+            return result;
+        } else {
+            return RETURN_STATUS::ERROR;
+        }
+    }
+
+    /**
+     * @brief Compute a BSP schedule for the given BSP instance.
+     * @param instance The BSP instance for which to compute the schedule.
+     * @return A pair containing the return status and the computed schedule.
+     */
+    virtual RETURN_STATUS computeSchedule(MaxBspSchedule<Graph_t> &schedule) = 0;
+
+    virtual RETURN_STATUS computeScheduleCS(MaxBspScheduleCS<Graph_t> &schedule) {
+// Fix me todo
+        auto result = computeSchedule(schedule);
+        if (result == RETURN_STATUS::OSP_SUCCESS || result == RETURN_STATUS::BEST_FOUND) {
+            // schedule.setAutoCommunicationSchedule();
+            return result;
+        } else {
+            return RETURN_STATUS::ERROR;
+        }
+    }
+};
+
+} // namespace osp
\ No newline at end of file
diff --git a/tests/bsp_schedulers.cpp b/tests/bsp_schedulers.cpp
index 94a9161b..d05b3b66 100644
--- a/tests/bsp_schedulers.cpp
+++ b/tests/bsp_schedulers.cpp
@@ -38,6 +38,7 @@ limitations under the License.
 #include "osp/bsp/scheduler/LoadBalanceScheduler/VariancePartitioner.hpp"
 #include "osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp"
 #include "osp/bsp/scheduler/Serial.hpp"
+#include "osp/bsp/scheduler/MaxBspScheduler.hpp"
 #include "osp/coarser/Sarkar/SarkarMul.hpp"
 #include "osp/coarser/SquashA/SquashAMul.hpp"
 #include "osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp"
@@ -210,12 +211,6 @@ BOOST_AUTO_TEST_CASE(variancefillup_test) {
     run_test(&test);
 }
 
-BOOST_AUTO_TEST_CASE(greedyvariancesspscheduler_test){
-    GreedyVarianceSspScheduler<computational_dag_vector_impl_def_t> test;
-    run_test(&test);
-}
-
-
 BOOST_AUTO_TEST_CASE(etf_test_edge_desc_impl) {
 
     EtfScheduler<computational_dag_edge_idx_vector_impl_def_t> test;
@@ -367,4 +362,35 @@ BOOST_AUTO_TEST_CASE(SarkarMul_improver_test) {
     MultilevelCoarseAndSchedule<computational_dag_edge_idx_vector_impl_def_t, computational_dag_edge_idx_vector_impl_def_t> coarsen_test(sched, improver, ml_coarsen);
     
     run_test(&coarsen_test);
+}
+
+// Tests computeSchedule(BspSchedule&) → staleness = 1
+BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_test_vector_impl) {
+    GreedyVarianceSspScheduler<computational_dag_vector_impl_def_t> test;
+    run_test(&test);
+}
+
+// Tests computeSchedule(BspSchedule&) → staleness = 1 (different graph impl)
+BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_test_edge_idx_impl) {
+    GreedyVarianceSspScheduler<computational_dag_edge_idx_vector_impl_def_t> test;
+    run_test(&test);
+}
+
+// Tests computeSchedule(MaxBspSchedule&) → staleness = 2
+BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_MaxBspSchedule_integration) {
+    using Graph_t = computational_dag_edge_idx_vector_impl_def_int_t;
+    BspInstance<Graph_t> instance;
+    instance.setNumberOfProcessors(2);
+    auto &dag = instance.getComputationalDag();
+    dag.add_vertex(5, 1, 0);
+    dag.add_vertex(3, 1, 0);
+    dag.add_edge(0, 1);
+
+    GreedyVarianceSspScheduler<Graph_t> scheduler;
+    MaxBspSchedule<Graph_t> schedule(instance);
+    const auto result = scheduler.computeSchedule(schedule);
+
+    BOOST_CHECK_EQUAL(result, RETURN_STATUS::OSP_SUCCESS);
+    BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
+    BOOST_CHECK(schedule.numberOfSupersteps() >= 2);
 }
\ No newline at end of file

From ad3bad478b6793e94483e3f83c2e0b75457d31c1 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Wed, 22 Oct 2025 15:07:35 +0200
Subject: [PATCH 3/6] Fixing bugs

---
 .../GreedyVarianceSspScheduler.hpp            | 60 ++++++++---------
 include/osp/bsp/scheduler/Scheduler.hpp       | 21 ++++++
 tests/bsp_schedulers.cpp                      | 65 ++++++++++++++-----
 3 files changed, 100 insertions(+), 46 deletions(-)

diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
index 2a8b8d99..8a455347 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
@@ -166,8 +166,6 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                             p = i;
                             found_allocation = true;
 
-                            procReady[i].erase(it);
-
                             if (procType < procTypesCompatibleWithNodeType_skip_proctype.size()) {
                                 const auto &compatibleTypes =
                                     procTypesCompatibleWithNodeType_skip_proctype[procType]
@@ -191,8 +189,6 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                         p = i;
                         found_allocation = true;
 
-                        procReady[i].erase(it);
-
                         if (procType < procTypesCompatibleWithNodeType_skip_proctype.size()) {
                             const auto &compatibleTypes =
                                 procTypesCompatibleWithNodeType_skip_proctype[procType]
@@ -243,8 +239,6 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                             node = it->first;
                             p = i;
 
-                            readyList.erase(it);
-
                             const auto &compatibleTypes =
                                 procTypesCompatibleWithNodeType_skip_proctype[procType]
                                     [instance.getComputationalDag().vertex_type(node)];
@@ -260,8 +254,6 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                         node = it->first;
                         p = i;
 
-                        readyList.erase(it);
-
                         const auto &compatibleTypes =
                             procTypesCompatibleWithNodeType_skip_proctype[procType]
                                 [instance.getComputationalDag().vertex_type(node)];
@@ -270,11 +262,10 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                             if (otherType < allReady.size())
                                 allReady[otherType].erase(std::make_pair(node, work_variance[node]));
                         }
-
+                        
                         return;
                     }
                 }
-
                 ++it;
             }
         }
@@ -364,20 +355,26 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
         return nr_nodes;
     }
 
-
     public:
 
-    RETURN_STATUS computeSspSchedule(BspSchedule<Graph_t> &schedule, unsigned stale) {
+    /**
+    * @brief Default constructor for GreedyVarianceSspScheduler.
+    */
+    GreedyVarianceSspScheduler(float max_percent_idle_processors_ = 0.2f, bool increase_parallelism_in_new_superstep_ = true)
+        : max_percent_idle_processors(max_percent_idle_processors_),
+          increase_parallelism_in_new_superstep(increase_parallelism_in_new_superstep_) {}
 
+    /**
+    * @brief Default destructor for GreedyVarianceSspScheduler.
+    */
+    virtual ~GreedyVarianceSspScheduler() = default;
+
+    RETURN_STATUS computeSspSchedule(BspSchedule<Graph_t> &schedule, unsigned stale) {
         const auto &instance = schedule.getInstance();
         const auto &G = instance.getComputationalDag();
-        const auto &N = instance.numberOfVertices();
+        const VertexType &N = instance.numberOfVertices();
         const unsigned &P = instance.numberOfProcessors();
 
-        for (auto v : G.vertices()) {
-            schedule.setAssignedProcessor(v, std::numeric_limits<unsigned>::max());
-        }
-        
         unsigned supstepIdx = 0;
 
         if constexpr (is_memory_constraint_v<MemoryConstraint_t>) {
@@ -407,10 +404,13 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
             ++nr_procs_per_type[instance.getArchitecture().processorType(proc)];
         }
 
-        std::vector<unsigned> nrPredecRemain(N);
-        for (VertexType node = 0; node < static_cast<VertexType>(N); ++node) {
+        std::vector<VertexType> nrPredecRemain(N);
+
+        for (VertexType node = 0; node < N; ++node) {
             const auto num_parents = G.in_degree(node);
-            nrPredecRemain[node] = static_cast<unsigned>(num_parents);
+
+            nrPredecRemain[node] = num_parents;
+
             if (num_parents == 0) {
                 ready[0].insert(std::make_pair(node, work_variances[node]));
                 nr_ready_stale_nodes_per_type[0][G.vertex_type(node)]++;
@@ -426,9 +426,6 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
         std::vector<unsigned> number_of_allocated_allReady_tasks_in_superstep(instance.getArchitecture().getNumberOfProcessorTypes(), 0);
         std::vector<unsigned> limit_of_number_of_allocated_allReady_tasks_in_superstep(instance.getArchitecture().getNumberOfProcessorTypes(), 0);
 
-        
-
-
         bool endSupStep = true;
         bool begin_outer_while = true;
         bool able_to_schedule_in_step = false;
@@ -478,12 +475,11 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                 }
 
                 for (unsigned procType = 0; procType < instance.getArchitecture().getNumberOfProcessorTypes(); procType++) {
-                    unsigned equal_split = (static_cast<unsigned int>(allReady[procType].size()) + stale - 1) / stale;
+                    unsigned equal_split = (static_cast<unsigned>(allReady[procType].size()) + stale - 1) / stale;
                     unsigned at_least_for_long_step = 3 * nr_procs_per_type[procType];
-
                     limit_of_number_of_allocated_allReady_tasks_in_superstep[procType] = std::max(at_least_for_long_step, equal_split);
                 }
-
+            
                 endSupStep = false;
                 finishTimes.emplace(0, std::numeric_limits<VertexType>::max());
             }
@@ -508,8 +504,7 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                             unsigned earliest_add = supstepIdx;
                             for (const auto& pred : G.parents(succ)) {
                                 if (schedule.assignedProcessor(pred) != proc_of_node) {
-                                    earliest_add = std::max(earliest_add,
-                                                            stale + schedule.assignedSuperstep(pred));
+                                    earliest_add = std::max(earliest_add, stale + schedule.assignedSuperstep(pred));
                                 }
                             }
 
@@ -542,6 +537,7 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
             if (!CanChooseNode(instance, allReady, procReady[supstepIdx % stale], procFree)) {
                 endSupStep = true;
             }
+
             while (CanChooseNode(instance, allReady, procReady[supstepIdx % stale], procFree)) {
                 VertexType nextNode = std::numeric_limits<VertexType>::max();
                 unsigned nextProc = P;
@@ -567,6 +563,7 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                     nr_old_ready_nodes_per_type[G.vertex_type(nextNode)]--;
                     const unsigned nextProcType = instance.getArchitecture().processorType(nextProc);
                     number_of_allocated_allReady_tasks_in_superstep[nextProcType]++;
+                    
                     if (number_of_allocated_allReady_tasks_in_superstep[nextProcType] >= limit_of_number_of_allocated_allReady_tasks_in_superstep[nextProcType]) {
                         allReady[nextProcType].clear();
                     }
@@ -575,6 +572,7 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
                 for (size_t i = 0; i < stale; i++) {
                     ready[i].erase(std::make_pair(nextNode, work_variances[nextNode]));
                 }
+
                 old_ready.erase(std::make_pair(nextNode, work_variances[nextNode]));
 
                 schedule.setAssignedProcessor(nextNode, nextProc);
@@ -605,7 +603,7 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
             else if (++successive_empty_supersteps > 100 + stale)
                 return RETURN_STATUS::ERROR;
 
-            if (free > static_cast<decltype(free)>(P * max_percent_idle_processors) &&
+            if (free > (P * max_percent_idle_processors) &&
                 ((!increase_parallelism_in_new_superstep) ||
                 get_nr_parallelizable_nodes(
                     instance, stale, nr_old_ready_nodes_per_type,
@@ -626,10 +624,12 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
     }
 
     RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) override {
+        std::cout<< "BspSchedule"<< std::endl;
         return computeSspSchedule(schedule, 1U);
     }
 
-    RETURN_STATUS computeSchedule(MaxBspSchedule<Graph_t> &schedule) {
+    RETURN_STATUS computeSchedule(MaxBspSchedule<Graph_t> &schedule) override {
+        std::cout<< "MaxBspSchedule"<< std::endl;
         return computeSspSchedule(schedule, 2U);
     }
 
diff --git a/include/osp/bsp/scheduler/Scheduler.hpp b/include/osp/bsp/scheduler/Scheduler.hpp
index dd06d187..2b275271 100644
--- a/include/osp/bsp/scheduler/Scheduler.hpp
+++ b/include/osp/bsp/scheduler/Scheduler.hpp
@@ -22,6 +22,7 @@ limitations under the License.
 #include <iostream>
 #include "osp/bsp/model/BspInstance.hpp"
 #include "osp/bsp/model/BspSchedule.hpp"
+#include "osp/bsp/model/MaxBspSchedule.hpp"
 #include "osp/bsp/model/BspScheduleCS.hpp"
 #include "osp/concepts/computational_dag_concept.hpp"
 
@@ -93,6 +94,26 @@ class Scheduler {
      */
     virtual RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) = 0;
 
+    /**
+     * @brief Compute a Max-BSP (Stale Synchronous Parallel) schedule for the given instance.
+     *
+     * This overload allows schedulers that support the Max-BSP or SSP model
+     * to implement asynchronous-like scheduling with configurable staleness.
+     * By default, this base implementation throws an exception, since not all
+     * schedulers provide Max-BSP support. Derived schedulers such as
+     * GreedyVarianceSspScheduler should override this function to perform
+     * their custom SSP scheduling logic.
+     *
+     * @param schedule Reference to a MaxBspSchedule object representing
+     *        the schedule to be computed and populated.
+     * @return A RETURN_STATUS code indicating the success or failure of the computation.
+     * @throws std::runtime_error If the scheduler does not implement Max-BSP scheduling.
+     */
+    virtual RETURN_STATUS computeSchedule(MaxBspSchedule<Graph_t> &schedule) {
+        (void)schedule;
+        throw std::runtime_error("Not implemented for this scheduler");
+    }
+
     virtual RETURN_STATUS computeScheduleCS(BspScheduleCS<Graph_t> &schedule) {
 
         auto result = computeSchedule(schedule);
diff --git a/tests/bsp_schedulers.cpp b/tests/bsp_schedulers.cpp
index d05b3b66..22e3e805 100644
--- a/tests/bsp_schedulers.cpp
+++ b/tests/bsp_schedulers.cpp
@@ -149,6 +149,51 @@ void run_test_2(Scheduler<Graph_t> *test_scheduler) {
     }
 };
 
+template<typename Graph_t>
+void run_test_max_bsp(Scheduler<Graph_t>* test_scheduler) {
+    std::vector<std::string> filenames_graph = tiny_spaa_graphs();
+    std::vector<std::string> filenames_architectures = test_architectures();
+
+    // Locate project root
+    std::filesystem::path cwd = std::filesystem::current_path();
+    while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) {
+        cwd = cwd.parent_path();
+    }
+
+    for (auto& filename_graph : filenames_graph) {
+        for (auto& filename_machine : filenames_architectures) {
+            std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1);
+            name_graph = name_graph.substr(0, name_graph.find_last_of("."));
+            std::string name_machine = filename_machine.substr(filename_machine.find_last_of("/\\") + 1);
+            name_machine = name_machine.substr(0, name_machine.rfind("."));
+
+            std::cout << std::endl
+                      << "Scheduler (MaxBsp): " << test_scheduler->getScheduleName() << std::endl
+                      << "Graph: " << name_graph << std::endl
+                      << "Architecture: " << name_machine << std::endl;
+
+            computational_dag_edge_idx_vector_impl_def_int_t graph;
+            BspArchitecture<Graph_t> arch;
+
+            bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), graph);
+            bool status_architecture =
+                file_reader::readBspArchitecture((cwd / filename_machine).string(), arch);
+
+            BOOST_REQUIRE_MESSAGE(status_graph, "Failed to read graph: " << filename_graph);
+            BOOST_REQUIRE_MESSAGE(status_architecture, "Failed to read architecture: " << filename_machine);
+
+            BspInstance<Graph_t> instance(graph, arch);
+
+            MaxBspSchedule<Graph_t> schedule(instance);
+
+            const auto result = test_scheduler->computeSchedule(schedule);
+
+            BOOST_CHECK_EQUAL(result, RETURN_STATUS::OSP_SUCCESS);
+            BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
+        }
+    }
+}
+
 BOOST_AUTO_TEST_CASE(GreedyBspScheduler_test) {
 
     GreedyBspScheduler<computational_dag_vector_impl_def_t> test;
@@ -377,20 +422,8 @@ BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_test_edge_idx_impl) {
 }
 
 // Tests computeSchedule(MaxBspSchedule&) → staleness = 2
-BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_MaxBspSchedule_integration) {
+BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_MaxBspSchedule_large_test) {
     using Graph_t = computational_dag_edge_idx_vector_impl_def_int_t;
-    BspInstance<Graph_t> instance;
-    instance.setNumberOfProcessors(2);
-    auto &dag = instance.getComputationalDag();
-    dag.add_vertex(5, 1, 0);
-    dag.add_vertex(3, 1, 0);
-    dag.add_edge(0, 1);
-
-    GreedyVarianceSspScheduler<Graph_t> scheduler;
-    MaxBspSchedule<Graph_t> schedule(instance);
-    const auto result = scheduler.computeSchedule(schedule);
-
-    BOOST_CHECK_EQUAL(result, RETURN_STATUS::OSP_SUCCESS);
-    BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
-    BOOST_CHECK(schedule.numberOfSupersteps() >= 2);
-}
\ No newline at end of file
+    GreedyVarianceSspScheduler<Graph_t> test;
+    run_test_max_bsp(&test);
+}

From 66a2e9305799bb02d93977a1e9c7185621c4dd3f Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Mon, 17 Nov 2025 15:02:42 +0100
Subject: [PATCH 4/6] Fixes

---
 include/osp/bsp/model/BspSchedule.hpp         |   2 +-
 .../GreedyVarianceSspScheduler.hpp            |  10 +-
 include/osp/bsp/scheduler/MaxBspScheduler.hpp |  11 +-
 include/osp/bsp/scheduler/Scheduler.hpp       |  20 ---
 tests/CMakeLists.txt                          |   2 +
 tests/bsp_schedulers.cpp                      |  67 +-------
 tests/max_bsp_schedulers.cpp                  | 151 ++++++++++++++++++
 7 files changed, 164 insertions(+), 99 deletions(-)
 create mode 100644 tests/max_bsp_schedulers.cpp

diff --git a/include/osp/bsp/model/BspSchedule.hpp b/include/osp/bsp/model/BspSchedule.hpp
index b30cb26c..299e1716 100644
--- a/include/osp/bsp/model/BspSchedule.hpp
+++ b/include/osp/bsp/model/BspSchedule.hpp
@@ -65,7 +65,7 @@ class BspSchedule : public IBspSchedule<Graph_t>, public IBspScheduleEval<Graph_
     unsigned number_of_supersteps;
 
     std::vector<unsigned> node_to_processor_assignment;
-    std::vector<unsigned> node_to_superstep_assignment; 
+    std::vector<unsigned> node_to_superstep_assignment;
 
     template<unsigned staleness>
     inline bool satisfies_precedence_constraints_staleness() const {
diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
index 8a455347..0ac5c041 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
@@ -31,9 +31,7 @@ limitations under the License.
 
 #include "MemoryConstraintModules.hpp"
 #include "osp/auxiliary/misc.hpp"
-#include "osp/bsp/model/BspSchedule.hpp"
-#include "osp/bsp/model/MaxBspSchedule.hpp"
-#include "osp/bsp/scheduler/Scheduler.hpp"
+#include "osp/bsp/scheduler/MaxBspScheduler.hpp"
 #include "osp/graph_algorithms/directed_graph_top_sort.hpp"
 
 namespace osp {
@@ -45,7 +43,7 @@ namespace osp {
  * It computes schedules for BspInstance using variance-based priorities.
  */
 template<typename Graph_t, typename MemoryConstraint_t = no_memory_constraint>
-class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
+class GreedyVarianceSspScheduler : public MaxBspScheduler<Graph_t> {
 
     static_assert(is_computational_dag_v<Graph_t>, "GreedyVarianceSspScheduler can only be used with computational DAGs.");
 
@@ -111,7 +109,7 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
 
     struct VarianceCompare {
         bool operator()(const std::pair<VertexType, double> &lhs, const std::pair<VertexType, double> &rhs) const {
-            return ((lhs.second > rhs.second) || ((lhs.second == rhs.second) && (lhs.first < rhs.first)));
+            return ((lhs.second > rhs.second) || ((lhs.second >= rhs.second) && (lhs.first < rhs.first)));
         }
     };
 
@@ -624,12 +622,10 @@ class GreedyVarianceSspScheduler : public Scheduler<Graph_t> {
     }
 
     RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) override {
-        std::cout<< "BspSchedule"<< std::endl;
         return computeSspSchedule(schedule, 1U);
     }
 
     RETURN_STATUS computeSchedule(MaxBspSchedule<Graph_t> &schedule) override {
-        std::cout<< "MaxBspSchedule"<< std::endl;
         return computeSspSchedule(schedule, 2U);
     }
 
diff --git a/include/osp/bsp/scheduler/MaxBspScheduler.hpp b/include/osp/bsp/scheduler/MaxBspScheduler.hpp
index 76f525f7..c6accf25 100644
--- a/include/osp/bsp/scheduler/MaxBspScheduler.hpp
+++ b/include/osp/bsp/scheduler/MaxBspScheduler.hpp
@@ -33,6 +33,7 @@ namespace osp {
  */
 template<typename Graph_t>
 class MaxBspScheduler : public Scheduler<Graph_t> {
+    public:
 
     static_assert(is_computational_dag_v<Graph_t>, "BspSchedule can only be used with computational DAGs.");
 
@@ -54,10 +55,11 @@ class MaxBspScheduler : public Scheduler<Graph_t> {
         return status;
     }
 
-    virtual RETURN_STATUS computeScheduleCS(BspScheduleCS<Graph_t> &schedule) {
-
-        auto result = computeSchedule(schedule);
+    virtual RETURN_STATUS computeScheduleCS(BspScheduleCS<Graph_t> &schedule) override {
+        MaxBspScheduleCS<Graph_t> tmpSchedule(schedule.getInstance());
+        auto result = computeScheduleCS(tmpSchedule);
         if (result == RETURN_STATUS::OSP_SUCCESS || result == RETURN_STATUS::BEST_FOUND) {
+            schedule = tmpSchedule;
             schedule.setAutoCommunicationSchedule();
             return result;
         } else {
@@ -73,7 +75,6 @@ class MaxBspScheduler : public Scheduler<Graph_t> {
     virtual RETURN_STATUS computeSchedule(MaxBspSchedule<Graph_t> &schedule) = 0;
 
     virtual RETURN_STATUS computeScheduleCS(MaxBspScheduleCS<Graph_t> &schedule) {
-// Fix me todo
         auto result = computeSchedule(schedule);
         if (result == RETURN_STATUS::OSP_SUCCESS || result == RETURN_STATUS::BEST_FOUND) {
             // schedule.setAutoCommunicationSchedule();
@@ -81,7 +82,7 @@ class MaxBspScheduler : public Scheduler<Graph_t> {
         } else {
             return RETURN_STATUS::ERROR;
         }
-    }
+    };
 };
 
 } // namespace osp
\ No newline at end of file
diff --git a/include/osp/bsp/scheduler/Scheduler.hpp b/include/osp/bsp/scheduler/Scheduler.hpp
index 2b275271..092d9946 100644
--- a/include/osp/bsp/scheduler/Scheduler.hpp
+++ b/include/osp/bsp/scheduler/Scheduler.hpp
@@ -94,26 +94,6 @@ class Scheduler {
      */
     virtual RETURN_STATUS computeSchedule(BspSchedule<Graph_t> &schedule) = 0;
 
-    /**
-     * @brief Compute a Max-BSP (Stale Synchronous Parallel) schedule for the given instance.
-     *
-     * This overload allows schedulers that support the Max-BSP or SSP model
-     * to implement asynchronous-like scheduling with configurable staleness.
-     * By default, this base implementation throws an exception, since not all
-     * schedulers provide Max-BSP support. Derived schedulers such as
-     * GreedyVarianceSspScheduler should override this function to perform
-     * their custom SSP scheduling logic.
-     *
-     * @param schedule Reference to a MaxBspSchedule object representing
-     *        the schedule to be computed and populated.
-     * @return A RETURN_STATUS code indicating the success or failure of the computation.
-     * @throws std::runtime_error If the scheduler does not implement Max-BSP scheduling.
-     */
-    virtual RETURN_STATUS computeSchedule(MaxBspSchedule<Graph_t> &schedule) {
-        (void)schedule;
-        throw std::runtime_error("Not implemented for this scheduler");
-    }
-
     virtual RETURN_STATUS computeScheduleCS(BspScheduleCS<Graph_t> &schedule) {
 
         auto result = computeSchedule(schedule);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 17975dde..8738643f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -122,6 +122,8 @@ endif()
 
 _add_test( bsp_schedulers )
 
+_add_test( max_bsp_schedulers )
+
 _add_test( bsp_schedulers_mem_const )
 
 _add_test( bsp_improvementschedulers )
diff --git a/tests/bsp_schedulers.cpp b/tests/bsp_schedulers.cpp
index 22e3e805..215e4e6b 100644
--- a/tests/bsp_schedulers.cpp
+++ b/tests/bsp_schedulers.cpp
@@ -38,7 +38,6 @@ limitations under the License.
 #include "osp/bsp/scheduler/LoadBalanceScheduler/VariancePartitioner.hpp"
 #include "osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp"
 #include "osp/bsp/scheduler/Serial.hpp"
-#include "osp/bsp/scheduler/MaxBspScheduler.hpp"
 #include "osp/coarser/Sarkar/SarkarMul.hpp"
 #include "osp/coarser/SquashA/SquashAMul.hpp"
 #include "osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp"
@@ -149,51 +148,6 @@ void run_test_2(Scheduler<Graph_t> *test_scheduler) {
     }
 };
 
-template<typename Graph_t>
-void run_test_max_bsp(Scheduler<Graph_t>* test_scheduler) {
-    std::vector<std::string> filenames_graph = tiny_spaa_graphs();
-    std::vector<std::string> filenames_architectures = test_architectures();
-
-    // Locate project root
-    std::filesystem::path cwd = std::filesystem::current_path();
-    while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) {
-        cwd = cwd.parent_path();
-    }
-
-    for (auto& filename_graph : filenames_graph) {
-        for (auto& filename_machine : filenames_architectures) {
-            std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1);
-            name_graph = name_graph.substr(0, name_graph.find_last_of("."));
-            std::string name_machine = filename_machine.substr(filename_machine.find_last_of("/\\") + 1);
-            name_machine = name_machine.substr(0, name_machine.rfind("."));
-
-            std::cout << std::endl
-                      << "Scheduler (MaxBsp): " << test_scheduler->getScheduleName() << std::endl
-                      << "Graph: " << name_graph << std::endl
-                      << "Architecture: " << name_machine << std::endl;
-
-            computational_dag_edge_idx_vector_impl_def_int_t graph;
-            BspArchitecture<Graph_t> arch;
-
-            bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), graph);
-            bool status_architecture =
-                file_reader::readBspArchitecture((cwd / filename_machine).string(), arch);
-
-            BOOST_REQUIRE_MESSAGE(status_graph, "Failed to read graph: " << filename_graph);
-            BOOST_REQUIRE_MESSAGE(status_architecture, "Failed to read architecture: " << filename_machine);
-
-            BspInstance<Graph_t> instance(graph, arch);
-
-            MaxBspSchedule<Graph_t> schedule(instance);
-
-            const auto result = test_scheduler->computeSchedule(schedule);
-
-            BOOST_CHECK_EQUAL(result, RETURN_STATUS::OSP_SUCCESS);
-            BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
-        }
-    }
-}
-
 BOOST_AUTO_TEST_CASE(GreedyBspScheduler_test) {
 
     GreedyBspScheduler<computational_dag_vector_impl_def_t> test;
@@ -407,23 +361,4 @@ BOOST_AUTO_TEST_CASE(SarkarMul_improver_test) {
     MultilevelCoarseAndSchedule<computational_dag_edge_idx_vector_impl_def_t, computational_dag_edge_idx_vector_impl_def_t> coarsen_test(sched, improver, ml_coarsen);
     
     run_test(&coarsen_test);
-}
-
-// Tests computeSchedule(BspSchedule&) → staleness = 1
-BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_test_vector_impl) {
-    GreedyVarianceSspScheduler<computational_dag_vector_impl_def_t> test;
-    run_test(&test);
-}
-
-// Tests computeSchedule(BspSchedule&) → staleness = 1 (different graph impl)
-BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_test_edge_idx_impl) {
-    GreedyVarianceSspScheduler<computational_dag_edge_idx_vector_impl_def_t> test;
-    run_test(&test);
-}
-
-// Tests computeSchedule(MaxBspSchedule&) → staleness = 2
-BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_MaxBspSchedule_large_test) {
-    using Graph_t = computational_dag_edge_idx_vector_impl_def_int_t;
-    GreedyVarianceSspScheduler<Graph_t> test;
-    run_test_max_bsp(&test);
-}
+}
\ No newline at end of file
diff --git a/tests/max_bsp_schedulers.cpp b/tests/max_bsp_schedulers.cpp
new file mode 100644
index 00000000..dbbd1da2
--- /dev/null
+++ b/tests/max_bsp_schedulers.cpp
@@ -0,0 +1,151 @@
+/*
+Copyright 2024 Huawei Technologies Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+*/
+
+#define BOOST_TEST_MODULE BSP_SCHEDULERS
+#include <boost/test/unit_test.hpp>
+
+#include <filesystem>
+#include <string>
+#include <vector>
+
+
+#include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp"
+#include "osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp"
+#include "osp/bsp/scheduler/Serial.hpp"
+#include "osp/bsp/scheduler/MaxBspScheduler.hpp"
+#include "osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp"
+#include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
+#include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp"
+#include "osp/auxiliary/io/arch_file_reader.hpp"
+#include "osp/auxiliary/io/hdag_graph_file_reader.hpp"
+#include "osp/auxiliary/io/general_file_reader.hpp"
+#include "test_graphs.hpp"
+
+using namespace osp;
+
+std::vector<std::string> test_architectures() { return {"data/machine_params/p3.arch"}; }
+
+template<typename Graph_t>
+void run_test(Scheduler<Graph_t> *test_scheduler) {
+    // static_assert(std::is_base_of<Scheduler, T>::value, "Class is not a scheduler!");
+    std::vector<std::string> filenames_graph = tiny_spaa_graphs();
+    std::vector<std::string> filenames_architectures = test_architectures();
+
+    // Getting root git directory
+    std::filesystem::path cwd = std::filesystem::current_path();
+    std::cout << cwd << std::endl;
+    while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) {
+        cwd = cwd.parent_path();
+        std::cout << cwd << std::endl;
+    }
+
+    for (auto &filename_graph : filenames_graph) {
+        for (auto &filename_machine : filenames_architectures) {
+            std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1);
+            name_graph = name_graph.substr(0, name_graph.find_last_of("."));
+            std::string name_machine = filename_machine.substr(filename_machine.find_last_of("/\\") + 1);
+            name_machine = name_machine.substr(0, name_machine.rfind("."));
+
+            std::cout << std::endl << "Scheduler: " << test_scheduler->getScheduleName() << std::endl;
+            std::cout << "Graph: " << name_graph << std::endl;
+            std::cout << "Architecture: " << name_machine << std::endl;
+
+            BspInstance<Graph_t> instance;
+
+            bool status_graph = file_reader::readGraph((cwd / filename_graph).string(),
+                                                                                instance.getComputationalDag());
+            bool status_architecture = file_reader::readBspArchitecture((cwd / "data/machine_params/p3.arch").string(),
+                                                                        instance.getArchitecture());
+
+            if (!status_graph || !status_architecture) {
+
+                std::cout << "Reading files failed." << std::endl;
+                BOOST_CHECK(false);
+            }
+
+            BspSchedule<Graph_t> schedule(instance);
+            const auto result = test_scheduler->computeSchedule(schedule);
+
+            BOOST_CHECK_EQUAL(RETURN_STATUS::OSP_SUCCESS, result);
+            BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
+        }
+    }
+};
+
+template<typename Graph_t>
+void run_test_max_bsp(MaxBspScheduler<Graph_t>* test_scheduler) {
+    std::vector<std::string> filenames_graph = tiny_spaa_graphs();
+    std::vector<std::string> filenames_architectures = test_architectures();
+
+    // Locate project root
+    std::filesystem::path cwd = std::filesystem::current_path();
+    while ((!cwd.empty()) && (cwd.filename() != "OneStopParallel")) {
+        cwd = cwd.parent_path();
+    }
+
+    for (auto& filename_graph : filenames_graph) {
+        for (auto& filename_machine : filenames_architectures) {
+            std::string name_graph = filename_graph.substr(filename_graph.find_last_of("/\\") + 1);
+            name_graph = name_graph.substr(0, name_graph.find_last_of("."));
+            std::string name_machine = filename_machine.substr(filename_machine.find_last_of("/\\") + 1);
+            name_machine = name_machine.substr(0, name_machine.rfind("."));
+
+            std::cout << std::endl
+                      << "Scheduler (MaxBsp): " << test_scheduler->getScheduleName() << std::endl
+                      << "Graph: " << name_graph << std::endl
+                      << "Architecture: " << name_machine << std::endl;
+
+            computational_dag_edge_idx_vector_impl_def_int_t graph;
+            BspArchitecture<Graph_t> arch;
+
+            bool status_graph = file_reader::readGraph((cwd / filename_graph).string(), graph);
+            bool status_architecture =
+                file_reader::readBspArchitecture((cwd / filename_machine).string(), arch);
+
+            BOOST_REQUIRE_MESSAGE(status_graph, "Failed to read graph: " << filename_graph);
+            BOOST_REQUIRE_MESSAGE(status_architecture, "Failed to read architecture: " << filename_machine);
+
+            BspInstance<Graph_t> instance(graph, arch);
+
+            MaxBspSchedule<Graph_t> schedule(instance);
+
+            const auto result = test_scheduler->computeSchedule(schedule);
+
+            BOOST_CHECK_EQUAL(result, RETURN_STATUS::OSP_SUCCESS);
+            BOOST_CHECK(schedule.satisfiesPrecedenceConstraints());
+        }
+    }
+}
+
+// Tests computeSchedule(BspSchedule&) → staleness = 1
+BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_test_vector_impl) {
+    GreedyVarianceSspScheduler<computational_dag_vector_impl_def_t> test;
+    run_test(&test);
+}
+
+// Tests computeSchedule(BspSchedule&) → staleness = 1 (different graph impl)
+BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_test_edge_idx_impl) {
+    GreedyVarianceSspScheduler<computational_dag_edge_idx_vector_impl_def_t> test;
+    run_test(&test);
+}
+
+// Tests computeSchedule(MaxBspSchedule&) → staleness = 2
+BOOST_AUTO_TEST_CASE(GreedyVarianceSspScheduler_MaxBspSchedule_large_test) {
+    GreedyVarianceSspScheduler<computational_dag_edge_idx_vector_impl_def_int_t> test;
+    run_test_max_bsp(&test);
+}

From c998da845dcc8bba0728ed677216d8070ca31777 Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Mon, 17 Nov 2025 15:29:36 +0100
Subject: [PATCH 5/6] Fix cast warning

---
 .../GreedySchedulers/GreedyVarianceSspScheduler.hpp      | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
index 0ac5c041..7a6c454d 100644
--- a/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
+++ b/include/osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp
@@ -75,7 +75,14 @@ class GreedyVarianceSspScheduler : public MaxBspScheduler<Graph_t> {
             }
             temp = std::log(temp) / 2 + max_priority;
 
-            double node_weight = std::log((double)std::max(graph.vertex_work_weight(*r_iter), static_cast<v_workw_t<Graph_t>>(1)));
+            double node_weight = std::log(
+                static_cast<double>(
+                    std::max(
+                        graph.vertex_work_weight(*r_iter),
+                        static_cast<v_workw_t<Graph_t>>(1)
+                    )
+                )
+            );
             double larger_val = node_weight > temp ? node_weight : temp;
 
             work_variance[*r_iter] =

From a2ed355d6238b88c1ca9a4936805faf56d2bfdeb Mon Sep 17 00:00:00 2001
From: Christos Konstantinos Matzoros
 <christos.konstantinos.matzoros@h-partners.com>
Date: Mon, 17 Nov 2025 17:37:39 +0100
Subject: [PATCH 6/6] Minor fixes

---
 include/osp/bsp/scheduler/Scheduler.hpp | 1 -
 tests/bsp_schedulers.cpp                | 1 -
 tests/max_bsp_schedulers.cpp            | 4 +---
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/include/osp/bsp/scheduler/Scheduler.hpp b/include/osp/bsp/scheduler/Scheduler.hpp
index 092d9946..dd06d187 100644
--- a/include/osp/bsp/scheduler/Scheduler.hpp
+++ b/include/osp/bsp/scheduler/Scheduler.hpp
@@ -22,7 +22,6 @@ limitations under the License.
 #include <iostream>
 #include "osp/bsp/model/BspInstance.hpp"
 #include "osp/bsp/model/BspSchedule.hpp"
-#include "osp/bsp/model/MaxBspSchedule.hpp"
 #include "osp/bsp/model/BspScheduleCS.hpp"
 #include "osp/concepts/computational_dag_concept.hpp"
 
diff --git a/tests/bsp_schedulers.cpp b/tests/bsp_schedulers.cpp
index 215e4e6b..936bad69 100644
--- a/tests/bsp_schedulers.cpp
+++ b/tests/bsp_schedulers.cpp
@@ -33,7 +33,6 @@ limitations under the License.
 #include "osp/bsp/scheduler/GreedySchedulers/GrowLocalAutoCoresParallel.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/RandomGreedy.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/VarianceFillup.hpp"
-#include "osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp"
 #include "osp/bsp/scheduler/LoadBalanceScheduler/LightEdgeVariancePartitioner.hpp"
 #include "osp/bsp/scheduler/LoadBalanceScheduler/VariancePartitioner.hpp"
 #include "osp/bsp/scheduler/LocalSearch/HillClimbing/hill_climbing.hpp"
diff --git a/tests/max_bsp_schedulers.cpp b/tests/max_bsp_schedulers.cpp
index dbbd1da2..a3a9aeda 100644
--- a/tests/max_bsp_schedulers.cpp
+++ b/tests/max_bsp_schedulers.cpp
@@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 
-@author Toni Boehnlein, Benjamin Lozes, Pal Andras Papp, Raphael S. Steiner
+@author Toni Boehnlein, Christos Matzoros, Pal Andras Papp, Raphael S. Steiner
 */
 
 #define BOOST_TEST_MODULE BSP_SCHEDULERS
@@ -26,9 +26,7 @@ limitations under the License.
 
 #include "osp/bsp/scheduler/GreedySchedulers/GreedyBspScheduler.hpp"
 #include "osp/bsp/scheduler/GreedySchedulers/GreedyVarianceSspScheduler.hpp"
-#include "osp/bsp/scheduler/Serial.hpp"
 #include "osp/bsp/scheduler/MaxBspScheduler.hpp"
-#include "osp/graph_implementations/adj_list_impl/compact_sparse_graph.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_edge_idx_vector_impl.hpp"
 #include "osp/graph_implementations/adj_list_impl/computational_dag_vector_impl.hpp"
 #include "osp/auxiliary/io/arch_file_reader.hpp"