diff --git a/.gitignore b/.gitignore
index e830c1d3c9..ad0f998716 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,6 +56,19 @@ tags
 poetry.lock
 *.code-workspace
 .env
+*.vim
+
+# Cmake files
+**/CMakeFiles
+**/cmake_install.cmake
+**/CMakeCache.txt
+**/compile_commands.json
+**/.cache
+**/build
+**/_deps
+finn_xsi/finn_xsi/unittests/*.cmake
+finn_xsi/finn_xsi/unittests/Makefile
+
 settings.yaml
 */.cache/*
 
@@ -100,6 +113,9 @@ MANIFEST
 /data/
 *.csv
 
+# Mock templated simulation config
+finn_xsi/finn_xsi/rtlsim_config.hpp
+
 # Google Drive key for dashboard
 /gdrive-key/
 
@@ -108,7 +124,7 @@ MANIFEST
 
 # downloaded dep repos
 /deps/
-/finn_deps/
+finn_deps/
 
 # local test directories for benchmarking infrastructure
 bench_input
diff --git a/finn-rtllib/removedatapath/hdl/dummy_template.v b/finn-rtllib/removedatapath/hdl/dummy_template.v
new file mode 100644
index 0000000000..36dec63915
--- /dev/null
+++ b/finn-rtllib/removedatapath/hdl/dummy_template.v
@@ -0,0 +1,25 @@
+module $TOP_MODULE_NAME$(
+//- Global Control ------------------
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out0_V, ASSOCIATED_RESET = ap_rst_n" *)
+(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+input   ap_clk,
+(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+input   ap_rst_n,
+
+//- AXI Stream - Input --------------
+output   in0_V_TREADY,
+input   in0_V_TVALID,
+input  [$WIDTH$-1:0] in0_V_TDATA,
+
+//- AXI Stream - Output --------------
+input   out0_V_TREADY,
+output   out0_V_TVALID,
+output  [$WIDTH$-1:0] out0_V_TDATA
+);
+
+assign	in0_V_TREADY = out0_V_TREADY;
+assign	out0_V_TVALID = in0_V_TVALID;
+assign	out0_V_TDATA = 0;
+
+
+endmodule
diff --git a/finn_xsi/finn_xsi/.clang-format b/finn_xsi/finn_xsi/.clang-format
new file mode 100644
index 0000000000..2df30c132e
--- /dev/null
+++ b/finn_xsi/finn_xsi/.clang-format
@@ -0,0 +1,46 @@
+BasedOnStyle: Chromium
+AccessModifierOffset: '1'
+AlignAfterOpenBracket: Align
+AlignConsecutiveMacros: 'true'
+AlignTrailingComments: 'true'
+AllowAllArgumentsOnNextLine: 'true'
+AllowShortBlocksOnASingleLine: 'true'
+AllowShortFunctionsOnASingleLine: 'true'
+AllowShortCaseLabelsOnASingleLine: 'false'
+AlwaysBreakTemplateDeclarations: 'Yes'
+BinPackParameters: 'true'
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: true
+ColumnLimit: '180'
+Cpp11BracedListStyle: 'true'
+FixNamespaceComments: 'true'
+IndentCaseLabels: 'true'
+IndentPPDirectives: BeforeHash
+IndentWidth: '4'
+IndentWrappedFunctionNames: 'true'
+IncludeBlocks: Regroup
+KeepEmptyLinesAtTheStartOfBlocks: 'false'
+Language: Cpp
+MaxEmptyLinesToKeep: '2'
+NamespaceIndentation: All
+PointerAlignment: Left
+ReflowComments: 'true'
+SortIncludes: 'true'
+SortUsingDeclarations: 'true'
+SpaceAfterCStyleCast: 'true'
+SpaceAfterLogicalNot: 'false'
+SpaceAfterTemplateKeyword: 'false'
+SpaceBeforeCpp11BracedList: 'false'
+SpaceBeforeCtorInitializerColon: 'true'
+SpaceBeforeInheritanceColon: 'true'
+SpaceInEmptyParentheses: 'false'
+SpacesInAngles: 'false'
+SpacesInCStyleCastParentheses: 'false'
+SpacesInContainerLiterals: 'false'
+SpacesInParentheses: 'false'
+SpacesInSquareBrackets: 'false'
+TabWidth: '4'
+---
+Language: Json
+BasedOnStyle: llvm
diff --git a/finn_xsi/finn_xsi/CMakeLists.txt b/finn_xsi/finn_xsi/CMakeLists.txt
new file mode 100644
index 0000000000..5f4daa4c12
--- /dev/null
+++ b/finn_xsi/finn_xsi/CMakeLists.txt
@@ -0,0 +1,134 @@
+cmake_minimum_required(VERSION 3.11)
+project(LayerSimulationBackend)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+# Require C++20
+set(CMAKE_CXX_EXTENSIONS ON)
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+message(STATUS "Using C++ Standard ${CMAKE_CXX_STANDARD}")
+SET(CMAKE_COLOR_MAKEFILE ON)
+
+message(STATUS "CMake cwd: ${CMAKE_CURRENT_SOURCE_DIR}")
+
+# Export compile commands for clangd
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# INCLUDES
+
+#Threads
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
+
+#OpenMP
+find_package(OpenMP REQUIRED)
+
+#Compiler Options
+add_library(fifosim_options INTERFACE)
+add_library(fifosim::options ALIAS fifosim_options)
+
+OPTION(FIFOSIM_ENABLE_ALLOPT "Enable all optimizations" ON)
+if(${FIFOSIM_ENABLE_ALLOPT})
+  message(STATUS "All optimizations are enabled")
+  target_compile_options(
+    fifosim_options
+    INTERFACE -Ofast -ffast-math -march=native -mtune=native -fstack-protector-strong -fopenmp -ffunction-sections -fdata-sections -pipe -funroll-loops -shared -fPIC -Wno-interference-size
+    # Additional performance options:
+    -flto=auto                    # Link-time optimization (auto-detect thread count)
+    -fno-plt                      # Avoid PLT for better performance with shared libs
+    -fno-semantic-interposition   # Allow more aggressive optimization in shared libs
+    -ftree-vectorize              # Enable auto-vectorization (usually on with -O3)
+    -fvect-cost-model=dynamic     # Better vectorization cost model
+    -fprefetch-loop-arrays        # Prefetch arrays in loops
+    -fno-math-errno               # Don't set errno for math functions (covered by -ffast-math mostly)
+    -fno-trapping-math            # Allow optimizations that may trap (part of -ffast-math)
+    -ffinite-math-only            # Assume no NaN/Inf (part of -ffast-math)
+    -fassociative-math            # Allow reassociation (part of -ffast-math)
+    )
+    target_link_options(
+    fifosim_options
+    INTERFACE
+    -flto=auto                    # LTO at link time
+    -Wl,--gc-sections             # Remove unused sections
+    -Wl,--as-needed               # Only link needed libraries
+    -Wl,-O3                       # Linker optimization level
+    -Wl,--hash-style=gnu          # Faster symbol lookup
+)
+  #target_link_options(fifosim_options INTERFACE -fsanitize=undefined,address)
+endif()
+
+### Enable compiler warnings
+option(FIFOSIM_ENABLE_WARNINGS "Enable warnings" ON)
+if (FIFOSIM_ENABLE_WARNINGS)
+  include(cmake/CompilerWarnings.cmake)
+  fifosim_set_project_warnings(
+    fifosim_options
+    OFF
+    ""
+    ""
+    ""
+    "")
+endif (FIFOSIM_ENABLE_WARNINGS)
+
+# Use ccache if available
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
+    message(STATUS "Using ccache for builds")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+endif()
+
+#
+# Create options for including cmake files from the cmake folder with a bit of output.
+#
+macro(check_include)
+  if(NOT ${ARGC} EQUAL 3)
+    message(FATAL_ERROR "Call to 'check_include' with ${ARGC} arguments instead of 3")
+  endif()
+  OPTION(${ARGV0} "Enable ${ARGV0}" ON)
+  if (${ARGV0})
+    message(STATUS "${ARGV1}: enabled")
+    include(cmake/${ARGV2})
+  else()
+    message(STATUS "${ARGV1}: disabled")
+  endif()
+endmacro()
+
+message(STATUS "Checks:")
+list(APPEND CMAKE_MESSAGE_INDENT "  ") #indent +1
+check_include(FIFOSIM_IPO          "InterproceduralOptimization" InterproceduralOptimization.cmake)
+list(POP_BACK CMAKE_MESSAGE_INDENT)    #indent -1
+
+# Collect source files
+file(GLOB_RECURSE CORE_SRC src/*.cpp)
+
+# For JSON writing
+include(FetchContent)
+FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.12.0/json.tar.xz)
+FetchContent_MakeAvailable(json)
+
+# Add boost for PO
+find_package(Boost COMPONENTS program_options REQUIRED)
+
+# Build the simulation library
+add_library(SimulationBackendLib SHARED ${CORE_SRC})
+target_include_directories(SimulationBackendLib PUBLIC "${CMAKE_BINARY_DIR}") # Include the rtlsim wrapper directory itself
+target_include_directories(SimulationBackendLib PUBLIC "$ENV{XILINX_VIVADO}/data/xsim/include") # Add xsim includes
+target_include_directories(SimulationBackendLib PUBLIC "include")
+target_link_libraries(SimulationBackendLib PUBLIC fifosim::options nlohmann_json::nlohmann_json Threads::Threads OpenMP::OpenMP_CXX -ldl -lrt)
+
+# Build the executable for connected simulations
+add_executable(LayerSimulationBackend LayerSimulationBackend.cpp)
+target_include_directories(LayerSimulationBackend SYSTEM PUBLIC ${Boost_INCLUDE_DIRS})
+target_link_libraries(LayerSimulationBackend SimulationBackendLib Boost::program_options)
+
+# Build the executable for isolated simulations
+add_executable(IsolatedSimulationBackend IsolatedSimulationBackend.cpp)
+target_include_directories(IsolatedSimulationBackend SYSTEM PUBLIC ${Boost_INCLUDE_DIRS})
+target_link_libraries(IsolatedSimulationBackend SimulationBackendLib Boost::program_options)
+
+OPTION(ENABLE_UNITTESTS "Enable unittests" OFF)
+if(${ENABLE_UNITTESTS})
+add_subdirectory(unittests)
+endif()
diff --git a/finn_xsi/finn_xsi/IsolatedSimulationBackend.cpp b/finn_xsi/finn_xsi/IsolatedSimulationBackend.cpp
new file mode 100644
index 0000000000..92de4a1772
--- /dev/null
+++ b/finn_xsi/finn_xsi/IsolatedSimulationBackend.cpp
@@ -0,0 +1,153 @@
+#include <IsolatedSimulation.hpp>
+#include <boost/program_options.hpp>
+#include <SocketServer.h>
+#include <chrono>
+#include <rtlsim_config.hpp>
+#include <thread>
+
+namespace po = boost::program_options;
+
+
+std::string getTime() {
+    auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+    auto formatted = std::put_time(std::localtime(&now), "[%T]");
+    std::stringstream ss;
+    ss << formatted;
+    return ss.str();
+}
+
+
+int main(int argc, const char* argv[]) {
+    // Parse CLI options
+    po::options_description desc{"Options"};
+    desc.add_options()("socket,s", po::value<std::string>(), "Unix domain socket path for IPC");
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    // Create simulation
+    IsolatedSimulation<RTLSimConfig::istream_descs.size(), RTLSimConfig::ostream_descs.size()> sim(
+        RTLSimConfig::kernel_libname,
+        RTLSimConfig::design_libname,
+        "xsim_log_file.txt",
+        "trace_file.wdb",
+        RTLSimConfig::istream_descs,
+        RTLSimConfig::ostream_descs
+    );
+
+
+
+    // Create controller
+    if (vm.count("socket")) {
+        const std::string socket_path = vm["socket"].as<std::string>();
+        std::cout << "Initializing socket server at: " << socket_path << std::endl;
+        std::cout.flush();
+
+        SocketServer server(socket_path);
+        if (auto error = server.initialize(); error.has_value()) {
+            std::cerr << "Failed to initialize socket server: " << *error << std::endl;
+            std::cerr.flush();
+            return 1;
+        }
+
+        std::cout << "Socket server initialized, waiting for commands..." << std::endl;
+        std::cout.flush();
+
+        // Preparing thread variable
+        std::optional<std::jthread> simThread = std::nullopt;
+        std::mutex simMutex;
+
+        // Command processing loop
+        std::size_t cycles = 0;
+        std::size_t statusSent = 0;
+        json response;
+        while (true) {
+            response = json::object();
+            // Read message
+            std::cout << getTime() << " Awaiting message..." << std::endl;
+            auto request = server.receive_message();
+            if (!request.has_value()) {
+                std::cout << getTime() << " Connection closed or error occurred" << std::endl;
+                break;
+            }
+
+            // Process message
+            std::string command = (*request)["command"];
+            std::cout << getTime() << " [Received command] " << command << std::endl;
+            if (command == "start") {
+                std::cout << getTime() << " Starting simulation" << std::endl;
+                if (!simThread.has_value()) {
+                    simThread = std::jthread([&sim, &simMutex, &cycles](std::stop_token stop) {
+                        {
+                            std::lock_guard<std::mutex> guard(simMutex);
+                            sim.simulate(true);
+                        }
+                        std::cout << getTime() << " Simulation initialized. Going into main loop." << std::endl;
+                        while (!stop.stop_requested()) {
+                            std::lock_guard<std::mutex> guard(simMutex);
+                            if (cycles % 10000 == 0) {
+                                std::cout << cycles << "   " << sim.getStatus() << std::endl;
+                            }
+                            sim.simulate(false);
+                            ++cycles;
+                            if (sim.isDone()) {
+                                // For now do not clean up the JSON logs, as this is
+                                // done by the "stop" command from the python side of things.
+                                // TODO: However this should be changed when the communication is
+                                // rewritten
+                                sim.commitLogsToDisk(false);
+                                break;
+                            }
+                        }
+                    });
+                } else {
+                    std::lock_guard<std::mutex> guard(simMutex);
+                    sim.resume();
+                }
+                response["state"] = "running";
+                server.send_message(response);
+            } else if (command == "stop") {
+                std::cout << getTime() << " Stopping simulation." << std::endl;
+                std::lock_guard<std::mutex> guard(simMutex);
+                std::cout << getTime() << " Final status: " << sim.getStatus() << std::endl;
+                std::cout << getTime() << " Is done? " << sim.isDone() << std::endl;
+                sim.halt();
+                if (simThread.has_value()) {
+                    simThread->request_stop();
+                }
+                sim.commitLogsToDisk(true);
+                response["state"] = "stopped";
+                server.send_message(response);
+            } else if (command == "pause") {
+                std::cout << getTime() << " Pausing simulation." << std::endl;
+                std::lock_guard<std::mutex> guard(simMutex);
+                if (simThread.has_value()) {
+                    simThread->request_stop();
+                }
+                response["state"] = "halted";
+                server.send_message(response);
+            } else if (command == "status") {
+                std::cout << getTime() << " [Sending] Sending status update " << statusSent + 1 << std::endl;
+                std::lock_guard<std::mutex> guard(simMutex);
+                json status = sim.getStatus();
+                server.send_message(status);
+                statusSent++;
+                std::cout << getTime() << " [Sending] Status " << statusSent << " update sent!" << std::endl;
+            } else {
+                std::cout << getTime() << " Unknown command " << command << std::endl;
+                std::cerr << "Unknown command " << command << std::endl;
+                response["state"] = "unknown_command";
+                server.send_message(response);
+            }
+
+            // Exit if stop command received
+            if ((*request)["command"] == "stop") {
+                break;
+            }
+        }
+        simThread->join();
+    } else {
+        throw std::runtime_error("Socket path not provided. Socket communication is required.");
+    }
+    return 0;
+}
diff --git a/finn_xsi/finn_xsi/LayerSimulationBackend.cpp b/finn_xsi/finn_xsi/LayerSimulationBackend.cpp
new file mode 100644
index 0000000000..314aa0a41a
--- /dev/null
+++ b/finn_xsi/finn_xsi/LayerSimulationBackend.cpp
@@ -0,0 +1,372 @@
+#include <AXIS_Control.h>
+#include <AXI_Control.h>
+#include <Clock.h>
+#include <Design.h>
+#include <Kernel.h>
+#include <Port.h>
+#include <SharedLibrary.h>
+#include <SocketServer.h>
+#include <sys/stat.h>
+
+#include <atomic>
+#include <boost/program_options.hpp>
+#include <boost/program_options/options_description.hpp>
+#include <cstddef>
+#include <iostream>
+#include <limits>
+#include <mutex>
+#include <thread>
+
+#define NDEBUG
+#include <Simulation.hpp>
+#include <rtlsim_config.hpp>
+
+namespace po = boost::program_options;
+
+constexpr std::size_t InstreamCount = RTLSimConfig::istream_descs.size();
+constexpr std::size_t OutstreamCount = RTLSimConfig::ostream_descs.size();
+
+static_assert(InstreamCount == RTLSimConfig::inputInterfaceNames.size(), "Number of input streams must match number of previous nodes");
+static_assert(OutstreamCount == RTLSimConfig::outputInterfaceNames.size(), "Number of output streams must match number of next nodes");
+
+// Simulation state management
+enum class SimulationState { IDLE, CONFIGURED, RUNNING, FINISHED, ERROR };
+
+class SimulationController {
+     private:
+    SingleNodeSimulation<InstreamCount, OutstreamCount, RTLSimConfig::LoggingEnabled, RTLSimConfig::NodeIndex, RTLSimConfig::TotalNodes, RTLSimConfig::IsInputNode,
+                         RTLSimConfig::IsOutputNode>& sim;
+    std::atomic<SimulationState> state{SimulationState::IDLE};
+    std::atomic<uint64_t> current_cycles{0};
+    std::atomic<uint64_t> current_samples{0};
+    std::mutex state_mutex;
+    std::string error_message;
+    std::jthread sim_thread;
+    std::vector<std::size_t> fifo_depths{2};
+    std::size_t max_cycles{std::numeric_limits<std::size_t>::max()};
+    bool timeout_occurred{false};
+
+     public:
+    explicit SimulationController(SingleNodeSimulation<InstreamCount, OutstreamCount, RTLSimConfig::LoggingEnabled, RTLSimConfig::NodeIndex, RTLSimConfig::TotalNodes,
+                                                       RTLSimConfig::IsInputNode, RTLSimConfig::IsOutputNode>& simulation)
+        : sim(simulation) {}
+
+    void configure(const std::vector<std::size_t>& depths, const std::vector<std::size_t>& expected_first_valid_cycles, std::size_t maxCycles) {
+        std::lock_guard<std::mutex> lock(state_mutex);
+        if (state != SimulationState::IDLE && state != SimulationState::FINISHED) {
+            throw std::runtime_error("Cannot configure while simulation is running");
+        }
+        fifo_depths = depths;
+        current_cycles = 0;
+        current_samples = 0;
+        max_cycles = maxCycles;
+        state = SimulationState::CONFIGURED;
+
+        // Reset simulation first
+        sim.reset();
+
+        // Configure FIFO depths AFTER reset
+        std::size_t num_fifos = sim.getFIFOCount();
+
+        if (fifo_depths.empty()) {
+            throw std::runtime_error("FIFO depths not configured");
+        }
+
+        // Apply depths: if list is shorter, use last value for remaining FIFOs
+        for (std::size_t i = 0; i < num_fifos; ++i) {
+            std::size_t depth_idx = std::min(i, fifo_depths.size() - 1);
+            sim.setFIFODepth(i, fifo_depths[depth_idx]);
+        }
+
+        for (std::size_t i = 0; i < expected_first_valid_cycles.size(); ++i) {
+            std::size_t cycles_idx = std::min(i, expected_first_valid_cycles.size() - 1);
+            sim.setFIFOCyclesUntilExpectedFirstValid(i, expected_first_valid_cycles[cycles_idx]);
+        }
+    }
+
+    void start() {
+        std::lock_guard<std::mutex> lock(state_mutex);
+        if (state != SimulationState::CONFIGURED) {
+            throw std::runtime_error("Simulation must be configured before starting");
+        }
+
+        state = SimulationState::RUNNING;
+
+        // Start simulation in a separate thread
+        sim_thread = std::jthread([this](std::stop_token stoken) {
+            try {
+                std::cout << "Starting simulation with max cycles: " << max_cycles << std::endl;
+
+                // Run the simulation
+                bool timeout = sim.runToStableState(stoken, max_cycles);
+
+                if (timeout) {
+                    state = SimulationState::FINISHED;
+                    timeout_occurred = true;
+                }
+
+                // Update state based on completion
+                if (!stoken.stop_requested()) {
+                    current_samples.store(sim.getCompletedMaps());
+                    state = SimulationState::FINISHED;
+                }
+                state = SimulationState::FINISHED;
+            } catch (const std::exception& e) {
+                std::lock_guard<std::mutex> error_lock(state_mutex);
+                std::cout << "Simulation error: " << e.what() << std::endl;
+                error_message = e.what();
+                state = SimulationState::ERROR;
+            }
+        });
+    }
+
+    void stop() {
+        if (sim_thread.joinable()) {
+            sim_thread.request_stop();
+            sim_thread.join();
+        }
+        if (state == SimulationState::RUNNING) {
+            state = SimulationState::FINISHED;
+        }
+    }
+
+    json get_status() const {
+        json status;
+        status["status"] = "success";
+
+        SimulationState current_state = state.load();
+        switch (current_state) {
+            case SimulationState::IDLE:
+                status["state"] = "idle";
+                break;
+            case SimulationState::CONFIGURED:
+                status["state"] = "configured";
+                break;
+            case SimulationState::RUNNING:
+                status["state"] = "running";
+                status["cycles"] = sim.getCyclesRun();
+                status["samples"] = sim.getCompletedMaps();
+                break;
+            case SimulationState::FINISHED:
+                status["state"] = "finished";
+                status["timeout"] = timeout_occurred;
+                if (timeout_occurred) {
+                    status["state"] = "timeout";
+                }
+                status["cycles"] = sim.getCyclesRun();
+                status["samples"] = sim.getCompletedMaps();
+                status["intervals"] = sim.getOStreamStableStateIntervals();
+                // Add FIFO depth data
+                {
+                    auto depths = sim.getFIFODepth();
+                    json fifo_depth = json::array();
+                    for (size_t i = 0; i < depths.size(); ++i) {
+                        fifo_depth.push_back(depths[i]);
+                    }
+                    if (!fifo_depth.empty()) {
+                        status["fifo_depth"] = fifo_depth;
+                    }
+                }
+                // Add FIFO utilization data
+                {
+                    auto utilizations = sim.getFIFOUtilization();
+                    json fifo_util = json::array();
+                    for (size_t i = 0; i < utilizations.size(); ++i) {
+                        fifo_util.push_back(utilizations[i]);
+                    }
+                    if (!fifo_util.empty()) {
+                        status["fifo_utilization"] = fifo_util;
+                    }
+                }
+                // Add FIFO cycles until first valid data
+                {
+                    auto cycles_until_valid = sim.getFIFOCyclesUntilFirstValid();
+                    json fifo_cycles = json::array();
+                    for (size_t i = 0; i < cycles_until_valid.size(); ++i) {
+                        fifo_cycles.push_back(cycles_until_valid[i]);
+                    }
+                    if (!fifo_cycles.empty()) {
+                        status["fifo_cycles_until_first_valid"] = fifo_cycles;
+                    }
+                }
+                // Add input/output job sizes
+                {
+                    json in_job_sizes = json::array();
+                    for (size_t i = 0; i < InstreamCount; ++i) {
+                        in_job_sizes.push_back(sim.getInputJobSize(i));
+                    }
+                    status["input_job_size"] = in_job_sizes;
+
+                    json out_job_sizes = json::array();
+                    for (size_t i = 0; i < OutstreamCount; ++i) {
+                        out_job_sizes.push_back(sim.getOutputJobSize(i));
+                    }
+                    status["output_job_size"] = out_job_sizes;
+                }
+                break;
+            case SimulationState::ERROR:
+                status["state"] = "error";
+                status["message"] = error_message;
+                break;
+        }
+        return status;
+    }
+
+    ~SimulationController() { stop(); }
+};
+
+void process_command(const json& request, json& response, SimulationController& controller) {
+    const std::string command = request["command"];
+    const json& payload = request["payload"];
+
+    try {
+        if (command == "configure") {
+            std::vector<std::size_t> fifo_depths;
+
+            // std::cout << "Payload: " << payload << std::endl;
+
+            // Handle fifo_depth as either a single value or an array
+            if (payload.contains("fifo_depth")) {
+                const auto& depth_value = payload["fifo_depth"];
+                if (depth_value.is_array()) {
+                    for (const auto& val : depth_value) {
+                        fifo_depths.push_back(val.get<std::size_t>());
+                    }
+                } else {
+                    fifo_depths.push_back(depth_value.get<std::size_t>());
+                }
+            } else {
+                fifo_depths.push_back(std::numeric_limits<std::size_t>::max());  // Default value
+            }
+
+            std::vector<std::size_t> expected_first_valid_cycles;
+            if (payload.contains("fifo_first_valid_cycles")) {
+                const auto& expected_cycles_value = payload["fifo_first_valid_cycles"];
+                if (expected_cycles_value.is_array()) {
+                    for (const auto& val : expected_cycles_value) {
+                        expected_first_valid_cycles.push_back(val.get<std::size_t>());
+                    }
+                } else {
+                    expected_first_valid_cycles.push_back(expected_cycles_value.get<std::size_t>());
+                }
+            }
+
+            if (fifo_depths.empty()) {
+                throw std::runtime_error("FIFO depth list cannot be empty");
+            }
+
+            std::size_t max_cycles = std::numeric_limits<size_t>::max();
+            if (payload.contains("max_cycles")) {
+                max_cycles = payload["max_cycles"].get<std::size_t>();
+            }
+
+            controller.configure(fifo_depths, expected_first_valid_cycles, max_cycles);
+            response["status"] = "success";
+            response["message"] = "Configuration successful";
+        } else if (command == "start") {
+            controller.start();
+            response["status"] = "success";
+            response["message"] = "Simulation started";
+        } else if (command == "status") {
+            response = controller.get_status();
+        } else if (command == "stop") {
+            controller.stop();
+            response["status"] = "success";
+            response["message"] = "Simulation stopped";
+            // Include final status with FIFO utilization and depth
+            json final_status = controller.get_status();
+            if (final_status.contains("fifo_utilization")) {
+                response["fifo_utilization"] = final_status["fifo_utilization"];
+            }
+            if (final_status.contains("fifo_depth")) {
+                response["fifo_depth"] = final_status["fifo_depth"];
+            }
+            if (final_status.contains("cycles")) {
+                response["cycles"] = final_status["cycles"];
+            }
+            if (final_status.contains("samples")) {
+                response["samples"] = final_status["samples"];
+            }
+            if (final_status.contains("intervals")) {
+                response["intervals"] = final_status["intervals"];
+            }
+            if (final_status.contains("timeout")) {
+                response["timeout"] = final_status["timeout"];
+            }
+            if (final_status.contains("fifo_cycles_until_first_valid")) {
+                response["fifo_cycles_until_first_valid"] = final_status["fifo_cycles_until_first_valid"];
+            }
+            if (final_status.contains("input_job_size")) {
+                response["input_job_size"] = final_status["input_job_size"];
+            }
+            if (final_status.contains("output_job_size")) {
+                response["output_job_size"] = final_status["output_job_size"];
+            }
+        } else {
+            response["status"] = "error";
+            response["message"] = "Unknown command: " + command;
+        }
+    } catch (const std::exception& e) {
+        response["status"] = "error";
+        response["message"] = std::string("Error: ") + e.what();
+    }
+}
+
+int main(int argc, const char* argv[]) {
+    // Parse CLI options
+    po::options_description desc{"Options"};
+    desc.add_options()("socket,s", po::value<std::string>(), "Unix domain socket path for IPC");
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    std::cout << "Connected Simulation Node Index: " << RTLSimConfig::NodeIndex << " / " << RTLSimConfig::TotalNodes << std::endl;
+
+    // Check if socket communication is enabled
+    if (vm.count("socket")) {
+        const std::string socket_path = vm["socket"].as<std::string>();
+        std::cout << "Initializing socket server at: " << socket_path << std::endl;
+        std::cout.flush();
+
+        SocketServer server(socket_path);
+        if (auto error = server.initialize(); error.has_value()) {
+            std::cerr << "Failed to initialize socket server: " << *error << std::endl;
+            std::cerr.flush();
+            return 1;
+        }
+
+        std::cout << "Socket server initialized, waiting for commands..." << std::endl;
+        std::cout.flush();
+
+        // Construct simulation
+        SingleNodeSimulation<InstreamCount, OutstreamCount, RTLSimConfig::LoggingEnabled, RTLSimConfig::NodeIndex, RTLSimConfig::TotalNodes, RTLSimConfig::IsInputNode,
+                             RTLSimConfig::IsOutputNode>
+            sim(RTLSimConfig::kernel_libname, RTLSimConfig::design_libname, "xsim_log_file.txt", "trace_file.txt", RTLSimConfig::istream_descs, RTLSimConfig::ostream_descs,
+                RTLSimConfig::inputInterfaceNames, RTLSimConfig::outputInterfaceNames, 2);
+
+        // Create simulation controller
+        SimulationController controller(sim);
+
+        // Command processing loop
+        while (true) {
+            auto request = server.receive_message();
+            if (!request.has_value()) {
+                std::cout << "Connection closed or error occurred" << std::endl;
+                break;
+            }
+
+            json response;
+            process_command(*request, response, controller);
+            server.send_message(response);
+
+            // Exit if stop command received
+            if ((*request)["command"] == "stop") {
+                break;
+            }
+        }
+    } else {
+        throw std::runtime_error("Socket path not provided. Socket communication is required.");
+    }
+
+    return 0;
+}
diff --git a/finn_xsi/finn_xsi/adapter.py b/finn_xsi/finn_xsi/adapter.py
index 993aaa95c8..859daeed19 100644
--- a/finn_xsi/finn_xsi/adapter.py
+++ b/finn_xsi/finn_xsi/adapter.py
@@ -78,15 +78,15 @@ def compile_sim_obj(top_module_name, source_list, sim_out_dir, debug=False, beha
         "floating_point_v7_1_18",
         "floating_point_v7_1_15",
         "floating_point_v7_1_19",
+        "work",
     ]
 
     cmd_xelab = [
         "xelab",
-        "work." + top_module_name,
+        "work." + "finn_design_wrapper",
         "-relax",
-        "-prj",
-        "rtlsim.prj",
         "-dll",
+        "--O3",
         "-s",
         top_module_name,
     ]
@@ -105,7 +105,10 @@ def compile_sim_obj(top_module_name, source_list, sim_out_dir, debug=False, beha
     if locate_glbl() is not None:
         cmd_xelab.insert(1, "work.glbl")
 
-    launch_process_helper(cmd_xelab, cwd=sim_out_dir)
+    cmd_xvlog = "xvlog --incr --relax -prj rtlsim.prj".split()
+
+    launch_process_helper(cmd_xvlog, cwd=sim_out_dir, print_stdout=False)
+    launch_process_helper(cmd_xelab, cwd=sim_out_dir, print_stdout=False)
     out_so_relative_path = "xsim.dir/%s/xsimk.so" % top_module_name
     out_so_full_path = sim_out_dir + "/" + out_so_relative_path
 
diff --git a/finn_xsi/finn_xsi/cmake/CompilerWarnings.cmake b/finn_xsi/finn_xsi/cmake/CompilerWarnings.cmake
new file mode 100644
index 0000000000..a606ab5163
--- /dev/null
+++ b/finn_xsi/finn_xsi/cmake/CompilerWarnings.cmake
@@ -0,0 +1,115 @@
+# from here:
+#
+# https://github.com/lefticus/cppbestpractices/blob/master/02-Use_the_Tools_Available.md
+
+function(
+  fifosim_set_project_warnings
+  project_name
+  WARNINGS_AS_ERRORS
+  MSVC_WARNINGS
+  CLANG_WARNINGS
+  GCC_WARNINGS
+  CUDA_WARNINGS)
+  if("${MSVC_WARNINGS}" STREQUAL "")
+    set(MSVC_WARNINGS
+        /W4 # Baseline reasonable warnings
+        /w14242 # 'identifier': conversion from 'type1' to 'type2', possible loss of data
+        /w14254 # 'operator': conversion from 'type1:field_bits' to 'type2:field_bits', possible loss of data
+        /w14263 # 'function': member function does not override any base class virtual member function
+        /w14265 # 'classname': class has virtual functions, but destructor is not virtual instances of this class may not
+                # be destructed correctly
+        /w14287 # 'operator': unsigned/negative constant mismatch
+        /we4289 # nonstandard extension used: 'variable': loop control variable declared in the for-loop is used outside
+                # the for-loop scope
+        /w14296 # 'operator': expression is always 'boolean_value'
+        /w14311 # 'variable': pointer truncation from 'type1' to 'type2'
+        /w14545 # expression before comma evaluates to a function which is missing an argument list
+        /w14546 # function call before comma missing argument list
+        /w14547 # 'operator': operator before comma has no effect; expected operator with side-effect
+        /w14549 # 'operator': operator before comma has no effect; did you intend 'operator'?
+        /w14555 # expression has no effect; expected expression with side- effect
+        /w14619 # pragma warning: there is no warning number 'number'
+        /w14640 # Enable warning on thread un-safe static member initialization
+        /w14826 # Conversion from 'type1' to 'type2' is sign-extended. This may cause unexpected runtime behavior.
+        /w14905 # wide string literal cast to 'LPSTR'
+        /w14906 # string literal cast to 'LPWSTR'
+        /w14928 # illegal copy-initialization; more than one user-defined conversion has been implicitly applied
+        /permissive- # standards conformance mode for MSVC compiler.
+    )
+  endif()
+
+  if("${CLANG_WARNINGS}" STREQUAL "")
+    set(CLANG_WARNINGS
+        -Wall
+        -Wextra # reasonable and standard
+        -Wshadow # warn the user if a variable declaration shadows one from a parent context
+        -Wnon-virtual-dtor # warn the user if a class with virtual functions has a non-virtual destructor. This helps
+        # catch hard to track down memory errors
+        -Wold-style-cast # warn for c-style casts
+        -Wcast-align # warn for potential performance problem casts
+        -Wunused # warn on anything being unused
+        -Woverloaded-virtual # warn if you overload (not override) a virtual function
+        -Wpedantic # warn if non-standard C++ is used
+        -Wconversion # warn on type conversions that may lose data
+        -Wsign-conversion # warn on sign conversions
+        -Wnull-dereference # warn if a null dereference is detected
+        -Wdouble-promotion # warn if float is implicit promoted to double
+        -Wformat=2 # warn on security issues around functions that format output (ie printf)
+        -Wimplicit-fallthrough # warn on statements that fallthrough without an explicit annotation
+    )
+  endif()
+
+  if("${GCC_WARNINGS}" STREQUAL "")
+    set(GCC_WARNINGS
+        ${CLANG_WARNINGS}
+        -Wmisleading-indentation # warn if indentation implies blocks where blocks do not exist
+        -Wduplicated-cond # warn if if / else chain has duplicated conditions
+        -Wduplicated-branches # warn if if / else branches have duplicated code
+        -Wlogical-op # warn about logical operations being used where bitwise were probably wanted
+        -Wuseless-cast # warn if you perform a cast to the same type
+    )
+  endif()
+
+  if("${CUDA_WARNINGS}" STREQUAL "")
+    set(CUDA_WARNINGS
+        -Wall
+        -Wextra
+        -Wunused
+        -Wconversion
+        -Wshadow
+        # TODO add more Cuda warnings
+    )
+  endif()
+
+  if(WARNINGS_AS_ERRORS)
+    message(TRACE "Warnings are treated as errors")
+    list(APPEND CLANG_WARNINGS -Werror)
+    list(APPEND GCC_WARNINGS -Werror)
+    list(APPEND MSVC_WARNINGS /WX)
+  endif()
+
+  if(MSVC)
+    set(PROJECT_WARNINGS_CXX ${MSVC_WARNINGS})
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES ".*Clang")
+    set(PROJECT_WARNINGS_CXX ${CLANG_WARNINGS})
+  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    set(PROJECT_WARNINGS_CXX ${GCC_WARNINGS})
+  else()
+    message(AUTHOR_WARNING "No compiler warnings set for CXX compiler: '${CMAKE_CXX_COMPILER_ID}'")
+    # TODO support Intel compiler
+  endif()
+
+  # use the same warning flags for C
+  set(PROJECT_WARNINGS_C "${PROJECT_WARNINGS_CXX}")
+
+  set(PROJECT_WARNINGS_CUDA "${CUDA_WARNINGS}")
+
+  target_compile_options(
+    ${project_name}
+    INTERFACE # C++ warnings
+              $<$<COMPILE_LANGUAGE:CXX>:${PROJECT_WARNINGS_CXX}>
+              # C warnings
+              $<$<COMPILE_LANGUAGE:C>:${PROJECT_WARNINGS_C}>
+              # Cuda warnings
+              $<$<COMPILE_LANGUAGE:CUDA>:${PROJECT_WARNINGS_CUDA}>)
+endfunction()
diff --git a/finn_xsi/finn_xsi/cmake/InterproceduralOptimization.cmake b/finn_xsi/finn_xsi/cmake/InterproceduralOptimization.cmake
new file mode 100644
index 0000000000..c5c513d14a
--- /dev/null
+++ b/finn_xsi/finn_xsi/cmake/InterproceduralOptimization.cmake
@@ -0,0 +1,7 @@
+include(CheckIPOSupported)
+check_ipo_supported(RESULT result OUTPUT output)
+if(result)
+set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON)
+else()
+message(SEND_ERROR "IPO is not supported: ${output}")
+endif()
diff --git a/finn_xsi/finn_xsi/include/AXIS_Control.h b/finn_xsi/finn_xsi/include/AXIS_Control.h
new file mode 100644
index 0000000000..c7f9a96f8b
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/AXIS_Control.h
@@ -0,0 +1,91 @@
+#ifndef AXIS_CONTROL
+#define AXIS_CONTROL
+
+#include <CommunicationChannel.hpp>
+#include <StableStateTracker.hpp>
+#include <functional>
+#include <string>
+#include <stop_token>
+
+// Fwd declarations
+namespace xsi {
+    class Design;
+    class Port;
+}  // namespace xsi
+class Clock;
+
+class AXIS_Control : public CommunicationChannel {
+     public:
+    // Constructor/destructor
+    AXIS_Control(xsi::Design& design, Clock& clock, size_t job_size, const std::string& prefix = "s_axis_");
+    AXIS_Control() = default;
+    virtual ~AXIS_Control() noexcept = default;
+
+    AXIS_Control(AXIS_Control&& other) = default;
+    AXIS_Control& operator=(AXIS_Control&& other) = default;
+
+    void inititialized_or_throw();
+
+    // Core functions - immediate writes
+    virtual void setInputValid(bool value = true, std::stop_token stoken = {}) override;
+    virtual bool getOutputValid(std::stop_token stoken = {}) noexcept override;
+    virtual void setOutputReady(bool value = true, std::stop_token stoken = {}) override;
+    virtual bool getInputReady(std::stop_token stoken = {}) noexcept override;
+
+    // Deferred write functions
+    std::reference_wrapper<xsi::Port> setValid(bool value = true);
+    std::reference_wrapper<xsi::Port> setReady(bool value = true);
+
+    virtual void writeBack() = 0;
+
+    // Job Size and Transaction Statistics
+    size_t job_size;
+    size_t job_txns;  // [0:job_size]
+    size_t total_txns;
+    size_t first_complete;  // First completion timestamp
+
+    // AXI interface prefix
+    std::string name;
+
+     protected:
+    const xsi::Design* design;
+    const Clock* clk;
+
+    xsi::Port* port_vld;
+    xsi::Port* port_rdy;
+};
+
+class S_AXIS_Control : public AXIS_Control {
+     public:
+    // Constructor/destructor
+    S_AXIS_Control(xsi::Design& design, Clock& clock, size_t job_size, size_t job_ticks, const std::string& prefix = "s_axis_");
+    S_AXIS_Control() = default;
+    ~S_AXIS_Control() noexcept = default;
+
+    S_AXIS_Control(S_AXIS_Control&& other) = default;
+    S_AXIS_Control& operator=(S_AXIS_Control&& other) = default;
+
+    void writeBack() override;
+
+    size_t job_ticks;   // throttle if job_size < job_ticks
+    size_t await_iter;  // iteration allowing start of next job
+};
+
+class M_AXIS_Control : public AXIS_Control {
+     public:
+    // Constructor/destructor
+    M_AXIS_Control(xsi::Design& design, Clock& clock, size_t job_size, const std::string& prefix = "m_axis_");
+    M_AXIS_Control() = default;
+    ~M_AXIS_Control() noexcept = default;
+
+    M_AXIS_Control(M_AXIS_Control&& other) = default;
+    M_AXIS_Control& operator=(M_AXIS_Control&& other) = default;
+
+    void writeBack() override;
+
+    size_t lastComplete = 0;
+    size_t interval = 0;
+    StableStateTracker<> stableState;
+};
+
+#endif /* AXIS_CONTROL */
diff --git a/finn_xsi/finn_xsi/include/AXI_Control.h b/finn_xsi/finn_xsi/include/AXI_Control.h
new file mode 100644
index 0000000000..24e0e11237
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/AXI_Control.h
@@ -0,0 +1,40 @@
+#ifndef AXI_CONTROL
+#define AXI_CONTROL
+
+#include <cstdint>
+#include <string>
+
+// Fwd declarations
+namespace xsi {
+    class Design;
+    class Port;
+}  // namespace xsi
+class Clock;
+
+class AXI_Control {
+     public:
+    // Constructor/destructor
+    AXI_Control(xsi::Design& design, Clock& clock, const std::string& axi_prefix = "AXI_Control_0_0_");
+    ~AXI_Control() noexcept = default;
+
+    // // Core register access functions
+    void writeRegister(uint32_t addr, uint32_t data);
+    uint32_t readRegister(uint32_t addr);
+
+     private:
+    // AXI interface prefix
+    std::string prefix;
+    xsi::Design& design;
+    Clock& clk;
+
+    // Helper functions for multi-bit signal handling
+    void writeAddr(const std::string& signal, uint32_t addr);
+    void writeData(const std::string& signal, uint32_t data);
+    void writeStrb(const std::string& signal, uint32_t strb);
+    uint32_t read(const std::string& signal);
+    void setBool(const std::string& signal);
+    void clearBool(const std::string& signal);
+    bool chkBool(const std::string& signal);
+};
+
+#endif /* AXI_CONTROL */
diff --git a/finn_xsi/finn_xsi/include/Clock.h b/finn_xsi/finn_xsi/include/Clock.h
new file mode 100644
index 0000000000..334d69690b
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/Clock.h
@@ -0,0 +1,36 @@
+#ifndef CLOCK
+#define CLOCK
+
+#include <functional>
+
+// Fwd declarations
+namespace xsi {
+    class Design;
+}
+
+class Clock {
+    xsi::Design& design;
+
+    Clock(Clock const&) = delete;
+    Clock& operator=(Clock const&) = delete;
+    Clock(xsi::Design& design);
+    template<size_t IStreamsSize, size_t OStreamsSize, bool LoggingEnabled>
+    friend class Simulation;
+
+     public:
+    Clock(Clock&&) noexcept = default;
+    Clock& operator=(Clock&&) noexcept = default;
+    ~Clock() noexcept = default;
+
+    std::function<void()> clkHigh;
+    std::function<void()> clkLow;
+    std::function<void(bool)> cycle;
+
+
+    void toggleClk() noexcept;
+
+    void clockHigh() noexcept;
+    void clockLow() noexcept;
+};
+
+#endif /* CLOCK */
diff --git a/finn_xsi/finn_xsi/include/CommunicationChannel.hpp b/finn_xsi/finn_xsi/include/CommunicationChannel.hpp
new file mode 100644
index 0000000000..1f94659da9
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/CommunicationChannel.hpp
@@ -0,0 +1,74 @@
+#ifndef COMMUNICATIONCHANNEL
+#define COMMUNICATIONCHANNEL
+
+#include <concepts>
+#include <stop_token>
+#include <iostream>
+
+template<typename T>
+concept ChannelInterface = requires(T t, bool b, std::stop_token stoken) {
+    { t.getOutputValid(stoken) } -> std::same_as<bool>;
+    { t.setInputValid(b, stoken) } -> std::same_as<void>;
+    { t.getInputReady(stoken) } -> std::same_as<bool>;
+    { t.setOutputReady(b, stoken) } -> std::same_as<void>;
+};
+
+class CommunicationChannel {
+    // Function pointers for downstream object methods
+    bool (*downstreamGetInputReadyFn)(void*, std::stop_token) = nullptr;
+    void (*downstreamSetInputValidFn)(void*, bool, std::stop_token) = nullptr;
+
+    void* downstreamObj = nullptr;
+
+     protected:
+    // Derived classes call this to register their own methods
+    template<ChannelInterface Derived>
+    void registerSelfAs() {
+        // This is intentionally empty - we call methods directly on 'this'
+        // The template just ensures Derived implements ChannelInterface
+    }
+
+     public:
+    template<ChannelInterface Derived>
+    void connectDownstream(Derived& downstreamPartner) {
+        this->downstreamObj = &downstreamPartner;
+
+        // Store function pointers for calling the DOWNSTREAM object's methods
+        downstreamGetInputReadyFn = [](void* obj, std::stop_token stoken) -> bool { return static_cast<Derived*>(obj)->getInputReady(stoken); };
+        downstreamSetInputValidFn = [](void* obj, bool v, std::stop_token stoken) { static_cast<Derived*>(obj)->setInputValid(v, stoken); };
+    }
+
+    // Mark as inline and noexcept for better optimization
+    inline void exchangeDataDownstream(std::stop_token stoken = {}) noexcept {
+        // Call methods on THIS object directly (non-virtual, resolved at compile time)
+        bool valid = this->getOutputValid(stoken);
+        // Call downstream object's methods via function pointers
+        downstreamSetInputValidFn(downstreamObj, valid, stoken);
+        bool ready = downstreamGetInputReadyFn(downstreamObj, stoken);
+        // Call method on THIS object directly
+        this->setOutputReady(ready, stoken);
+    }
+
+    virtual bool getOutputValid([[maybe_unused]] std::stop_token stoken = {}) = 0;
+    virtual void setInputValid([[maybe_unused]] bool v, [[maybe_unused]] std::stop_token stoken = {}) = 0;
+    virtual bool getInputReady([[maybe_unused]] std::stop_token stoken = {}) = 0;
+    virtual void setOutputReady([[maybe_unused]] bool r, [[maybe_unused]] std::stop_token stoken = {}) = 0;
+
+    virtual ~CommunicationChannel() = default;
+};
+
+// Example usage:
+// class LayerA : public CommunicationChannel {
+// public:
+//     bool getOutputValid(std::stop_token stoken = {}) { /* ... */ }
+//     void setInputValid(bool v, std::stop_token stoken = {}) { /* ... */ }
+//     bool getInputReady(std::stop_token stoken = {}) { /* ... */ }
+//     void setOutputReady(bool r, std::stop_token stoken = {}) { /* ... */ }
+// };
+//
+// LayerA a;
+// LayerB b;
+// a.connectDownstream(b);
+// a.exchangeDataDownstream(); // or with stop_token: a.exchangeDataDownstream(stoken);
+
+#endif /* COMMUNICATIONCHANNEL */
diff --git a/finn_xsi/finn_xsi/include/Design.h b/finn_xsi/finn_xsi/include/Design.h
new file mode 100644
index 0000000000..83a9016c43
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/Design.h
@@ -0,0 +1,53 @@
+#ifndef DESIGN
+#define DESIGN
+
+#include <Kernel.h>
+
+namespace xsi {
+
+    //	- non-copyable handle for exposing simulation control.
+    class Design {
+        xsi::Kernel _kernel;
+
+         public:
+        Design(xsi::Kernel& kernel, const std::string& design_lib, const s_xsi_setup_info& setup_info);
+        Design(xsi::Kernel& kernel, const std::string& design_lib, const char* const log_file = nullptr, const char* const wdb_file = nullptr);
+        ~Design();
+
+         private:
+        Design(Design const&) = delete;
+        Design& operator=(Design const&) = delete;
+
+         public:
+        // Move constructor
+        Design(Design&& other) noexcept;
+
+        // Move assignment operator
+        Design& operator=(Design&& other) noexcept;
+
+        //-----------------------------------------------------------------------
+        // Forwarded Access to Open Simulation
+
+        // Simulation Control & Status
+         public:
+        void trace_all();
+        void run(const XSI_INT64 step);
+        void restart();
+
+        int get_status() const noexcept;
+        const char* get_error_info() const noexcept;
+
+        // Port Access
+         public:
+        int num_ports() const noexcept;
+
+        xsi::Port& getPort(const std::string& name);
+        const xsi::Port& getPort(const std::string& name) const;
+
+        std::span<xsi::Port> ports() noexcept;
+        std::span<const xsi::Port> ports() const noexcept;
+
+    };  // class Design
+}  // namespace xsi
+
+#endif /* DESIGN */
diff --git a/finn_xsi/finn_xsi/include/FIFO.h b/finn_xsi/finn_xsi/include/FIFO.h
new file mode 100644
index 0000000000..9cb09f9970
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/FIFO.h
@@ -0,0 +1,41 @@
+#ifndef FIFO_H
+#define FIFO_H
+
+#include <CommunicationChannel.hpp>
+#include <cstdint>
+#include <limits>
+#include <stop_token>
+
+class FIFO : public CommunicationChannel {
+    uint64_t maxUtil = 0;
+    uint64_t currentUtil = 0;
+    uint64_t maxSize = 0;
+    uint64_t nextUtil = 0;
+    uint64_t cyclesUntilExpectedFirstValid = std::numeric_limits<uint64_t>::max();
+    uint64_t initialCyclesUntilExpectedFirstValid = std::numeric_limits<uint64_t>::max();
+
+     public:
+    FIFO(uint64_t size = std::numeric_limits<uint64_t>::max());
+    ~FIFO();
+
+    void update(bool incomingValid, bool incomingReady);
+    bool toggleClock();
+    virtual bool getInputReady(std::stop_token stoken = {}) noexcept override;
+    virtual bool getOutputValid(std::stop_token stoken = {}) noexcept override;
+    bool isEmpty() const;
+    void reset(uint64_t size = std::numeric_limits<uint64_t>::max());
+    void setCyclesUntilExpectedFirstValid(uint64_t cycles);
+    uint64_t getCyclesUntilFirstValid() const;
+    void setMaxSize(const uint64_t size);
+    uint64_t getMaxSize() const;
+    uint64_t getSpaceLeft() const;
+    uint64_t getMaxUtil() const;
+    void increaseCounter(const uint64_t count);
+
+    // NOTE: User needs to ensure proper ordering. No runtime enforcement of order.
+    virtual void setInputValid(bool incomingValid, std::stop_token stoken = {}) override;
+    virtual void setOutputReady(bool incomingReady, std::stop_token stoken = {}) override;
+    uint64_t size() const;
+};
+
+#endif /* FIFO_H */
diff --git a/finn_xsi/finn_xsi/include/InterprocessCommunicationChannel.hpp b/finn_xsi/finn_xsi/include/InterprocessCommunicationChannel.hpp
new file mode 100644
index 0000000000..e8ba6d85fc
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/InterprocessCommunicationChannel.hpp
@@ -0,0 +1,247 @@
+#ifndef INTERPROCESSCOMMUNICATIONCHANNEL
+#define INTERPROCESSCOMMUNICATIONCHANNEL
+
+#include <atomic>
+#include <boost/interprocess/managed_shared_memory.hpp>
+#include <new>
+#include <thread>
+#include <iostream>
+
+#ifndef CACHE_LINE_SIZE
+    #ifdef __cpp_lib_hardware_interference_size
+constexpr std::size_t CACHE_LINE_SIZE = std::hardware_destructive_interference_size;
+    #else
+constexpr std::size_t CACHE_LINE_SIZE = 64;
+    #endif
+#endif
+
+namespace bip = boost::interprocess;
+
+// ===== INTERPROCESS ASYMMETRIC REQUEST-RESPONSE EXCHANGE =====
+// Concepts for constraining methods based on role
+template<bool IsSender>
+concept Sender = IsSender;
+
+constexpr int MAX_SPIN_WAIT = 100;
+
+template<typename Request, typename Response, bool IsSender, std::size_t SharedMemorySize = 4096>
+class InterprocessCommunicationChannel {
+     private:
+    // ===== SHARED MEMORY STRUCTURE =====
+    struct alignas(CACHE_LINE_SIZE) SharedChannelData {
+        struct alignas(CACHE_LINE_SIZE) RequestSlot {
+            Request data;
+            std::atomic<bool> valid;
+
+            RequestSlot() : data(), valid(false) {}
+        };
+
+        struct alignas(CACHE_LINE_SIZE) ResponseSlot {
+            Response data;
+            std::atomic<bool> valid;
+
+            ResponseSlot() : data(), valid(false) {}
+        };
+
+        // Double-buffered requests and responses
+        RequestSlot requests[2];
+        ResponseSlot responses[2];
+
+        alignas(CACHE_LINE_SIZE) std::atomic<int> request_write_idx;
+        alignas(CACHE_LINE_SIZE) std::atomic<int> request_read_idx;
+        alignas(CACHE_LINE_SIZE) std::atomic<int> response_write_idx;
+        alignas(CACHE_LINE_SIZE) std::atomic<int> response_read_idx;
+
+        SharedChannelData() : request_write_idx(0), request_read_idx(0), response_write_idx(0), response_read_idx(0) {
+            // Verify atomics are lock-free (required for shared memory)
+            static_assert(std::atomic<bool>::is_always_lock_free, "std::atomic<bool> must be lock-free for inter-process use");
+            static_assert(std::atomic<int>::is_always_lock_free, "std::atomic<int> must be lock-free for inter-process use");
+        }
+    };
+
+    // ===== PROCESS-LOCAL STATE =====
+    SharedChannelData* channel = nullptr;
+    std::atomic<int>* refCount = nullptr;
+    const std::string sharedMemoryName;
+    bip::managed_shared_memory shmem;
+
+     public:
+    // Default constructor
+    InterprocessCommunicationChannel() : sharedMemoryName("") {}
+
+    // Constructor with shared memory name
+    InterprocessCommunicationChannel(const std::string& shmName) : sharedMemoryName(shmName) {
+        if constexpr (IsSender) {
+            // Sender creates shared memory
+            bip::shared_memory_object::remove(sharedMemoryName.c_str());
+            shmem = bip::managed_shared_memory(bip::create_only, sharedMemoryName.c_str(), SharedMemorySize);
+            std::cout << "Created shared memory: " << sharedMemoryName << std::endl;
+        } else {
+            // Receiver opens existing shared memory
+            std::cout << "Waiting to connect to shared memory: " << sharedMemoryName << std::endl;
+            while (true) {
+                try {
+                    shmem = bip::managed_shared_memory(bip::open_only, sharedMemoryName.c_str());
+                    break;
+                } catch (const bip::interprocess_exception& e) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+            }
+            std::cout << "Connected to shared memory: " << sharedMemoryName << std::endl;
+        }
+
+        // Construct or find the reference counter
+        refCount = shmem.find_or_construct<std::atomic<int>>("refCount")(0);
+        refCount->fetch_add(1, std::memory_order_acq_rel);
+
+        // Construct the channel data in shared memory
+        channel = shmem.find_or_construct<SharedChannelData>("ChannelData")();
+
+    }
+
+    void handshake() {
+        // Perform handshake to verify communication works
+        if constexpr (IsSender) {
+            // Sender: send test request and wait for response
+            std::cout << "Sending handshake test request for " << sharedMemoryName << std::endl;
+            Request test_request{};
+            Response test_response = send_request(test_request);
+            std::cout << "Received handshake test response for " << sharedMemoryName << std::endl;
+            // Communication verified if we got here without hanging
+        } else {
+            // Receiver: wait for test request and send response
+            std::cout << "Waiting for handshake test request for " << sharedMemoryName << std::endl;
+            Request test_request = receive_request();
+            std::cout << "Received handshake test request for " << sharedMemoryName << std::endl;
+            Response test_response{};
+            send_response(test_response);
+            std::cout << "Sent handshake test response for " << sharedMemoryName << std::endl;
+            // Communication verified if we got here
+        }
+    }
+
+    // Delete copy operations
+    InterprocessCommunicationChannel(const InterprocessCommunicationChannel&) = delete;
+    InterprocessCommunicationChannel& operator=(const InterprocessCommunicationChannel&) = delete;
+
+    // Move constructor
+    InterprocessCommunicationChannel(InterprocessCommunicationChannel&& other) noexcept
+        : channel(other.channel), refCount(other.refCount), sharedMemoryName(std::move(other.sharedMemoryName)), shmem(std::move(other.shmem)) {
+        other.channel = nullptr;
+        other.refCount = nullptr;
+    }
+
+    // Move assignment operator
+    InterprocessCommunicationChannel& operator=(InterprocessCommunicationChannel&& other) noexcept {
+        if (this != &other) {
+            channel = other.channel;
+            refCount = other.refCount;
+            shmem.swap(other.shmem);
+            const_cast<std::string&>(sharedMemoryName) = std::move(other.sharedMemoryName);
+
+            other.channel = nullptr;
+            other.refCount = nullptr;
+        }
+        return *this;
+    }
+
+    ~InterprocessCommunicationChannel() {
+        if (!refCount || !channel) {
+            return;
+        }
+
+        channel = nullptr;
+        refCount = nullptr;
+
+        std::atomic<int>* ref_ptr = shmem.find<std::atomic<int>>("refCount").first;
+        if (!ref_ptr) {
+            return;
+        }
+
+        int remainingRefs = ref_ptr->fetch_sub(1, std::memory_order_acq_rel) - 1;
+
+        if (remainingRefs == 0) {
+            shmem.destroy<SharedChannelData>("ChannelData");
+            shmem.destroy<std::atomic<int>>("refCount");
+            shmem = bip::managed_shared_memory();
+            bip::shared_memory_object::remove(sharedMemoryName.c_str());
+        }
+    }
+
+    // SENDER SIDE: Send request, wait for response
+    Response send_request(const Request& req, std::stop_token stoken = {})
+        requires Sender<IsSender>
+    {
+        // Write request
+        int write_slot = channel->request_write_idx.load(std::memory_order_acquire) % 2;
+        channel->requests[write_slot].data = req;
+        channel->requests[write_slot].valid.store(true, std::memory_order_release);
+        channel->request_write_idx.fetch_add(1, std::memory_order_release);
+
+        // Wait for response in corresponding slot
+        int read_slot = channel->response_read_idx.load(std::memory_order_acquire) % 2;
+        int spin_count = 0;
+        while (!channel->responses[read_slot].valid.load(std::memory_order_acquire) && !stoken.stop_requested()) {
+            if (spin_count++ >= MAX_SPIN_WAIT) {
+                std::this_thread::yield();
+                spin_count = 0;
+            } else {
+#if defined(__x86_64__) || defined(_M_X64)
+                __builtin_ia32_pause();
+#elif defined(__aarch64__)
+                asm volatile("yield" ::: "memory");
+#endif
+            }
+        }
+
+        if (stoken.stop_requested()) {
+            return Response{};  // Return default-constructed response on cancellation
+        }
+
+        Response resp = channel->responses[read_slot].data;
+        channel->responses[read_slot].valid.store(false, std::memory_order_release);
+        channel->response_read_idx.fetch_add(1, std::memory_order_release);
+
+        return resp;
+    }
+
+    // RECEIVER SIDE: Wait for request, send response
+    Request receive_request(std::stop_token stoken = {})
+        requires(!Sender<IsSender>)
+    {
+        int read_slot = channel->request_read_idx.load(std::memory_order_acquire) % 2;
+        int spin_count = 0;
+
+        while (!channel->requests[read_slot].valid.load(std::memory_order_acquire) && !stoken.stop_requested()) {
+            if (spin_count++ >= MAX_SPIN_WAIT) {
+                std::this_thread::yield();
+                spin_count = 0;
+            } else {
+#if defined(__x86_64__) || defined(_M_X64)
+                __builtin_ia32_pause();
+#elif defined(__aarch64__)
+                asm volatile("yield" ::: "memory");
+#endif
+            }
+        }
+
+        if (stoken.stop_requested()) {
+            return Request{};  // Return default-constructed request on cancellation
+        }
+
+        Request req = channel->requests[read_slot].data;
+        channel->requests[read_slot].valid.store(false, std::memory_order_release);
+        channel->request_read_idx.fetch_add(1, std::memory_order_release);
+
+        return req;
+    }
+
+    void send_response(const Response& resp)
+        requires(!Sender<IsSender>)
+    {
+        int write_slot = channel->response_write_idx.load(std::memory_order_acquire) % 2;
+        channel->responses[write_slot].data = resp;
+        channel->responses[write_slot].valid.store(true, std::memory_order_release);
+        channel->response_write_idx.fetch_add(1, std::memory_order_release);
+    }
+};
+
+#endif /* INTERPROCESSCOMMUNICATIONCHANNEL */
diff --git a/finn_xsi/finn_xsi/include/InterprocessCommunicationChannelInterface.hpp b/finn_xsi/finn_xsi/include/InterprocessCommunicationChannelInterface.hpp
new file mode 100644
index 0000000000..1ae409c0af
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/InterprocessCommunicationChannelInterface.hpp
@@ -0,0 +1,72 @@
+#ifndef INTERPROCESSCOMMUNICATIONCHANNELINTERFACE
+#define INTERPROCESSCOMMUNICATIONCHANNELINTERFACE
+
+#include <CommunicationChannel.hpp>
+#include <InterprocessCommunicationChannel.hpp>
+#include <stop_token>
+
+template<bool IsSender>
+class InterprocessCommunicationChannelInterface : public CommunicationChannel {
+    struct Forward {
+        bool valid;
+    };
+
+    struct Backward {
+        bool ready;
+    };
+
+    InterprocessCommunicationChannel<Forward, Backward, IsSender> channel;
+    Backward lastResponse;
+
+     public:
+    // Default constructor
+    InterprocessCommunicationChannelInterface() = default;
+
+    // Constructor with shared memory name
+    explicit InterprocessCommunicationChannelInterface(const std::string& shmName) : channel(shmName), lastResponse{false} {}
+
+    // Delete copy operations
+    InterprocessCommunicationChannelInterface(const InterprocessCommunicationChannelInterface&) = delete;
+    InterprocessCommunicationChannelInterface& operator=(const InterprocessCommunicationChannelInterface&) = delete;
+
+    // Move constructor
+    InterprocessCommunicationChannelInterface(InterprocessCommunicationChannelInterface&& other) noexcept = default;
+
+    // Move assignment operator
+    InterprocessCommunicationChannelInterface& operator=(InterprocessCommunicationChannelInterface&& other) noexcept = default;
+
+    virtual bool getInputReady([[maybe_unused]] std::stop_token stoken = {}) override {
+        if constexpr (!IsSender) {
+            throw std::runtime_error("getInputReady can only be called on sender instances.");
+        } else {
+            return lastResponse.ready;
+        }
+
+    }
+    virtual bool getOutputValid(std::stop_token stoken = {}) override {
+        if constexpr (IsSender) {
+            throw std::runtime_error("getOutputValid can only be called on receiver instances.");
+        } else {
+            return channel.receive_request(stoken).valid;
+        }
+    }
+
+    virtual void setInputValid(bool incomingValid, std::stop_token stoken = {}) override {
+        if constexpr (!IsSender) {
+            throw std::runtime_error("setInputValid can only be called on sender instances.");
+        } else {
+            lastResponse = channel.send_request(Forward{incomingValid}, stoken);
+        }
+    }
+    virtual void setOutputReady(bool incomingReady, [[maybe_unused]] std::stop_token stoken = {}) override {
+        if constexpr (IsSender) {
+            throw std::runtime_error("setOutputReady can only be called on receiver instances.");
+        } else {
+            channel.send_response(Backward{incomingReady});
+        }
+    }
+
+    virtual ~InterprocessCommunicationChannelInterface() = default;
+};
+
+#endif /* INTERPROCESSCOMMUNICATIONCHANNELINTERFACE */
diff --git a/finn_xsi/finn_xsi/include/IsolatedSimulation.hpp b/finn_xsi/finn_xsi/include/IsolatedSimulation.hpp
new file mode 100644
index 0000000000..8b9636efc0
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/IsolatedSimulation.hpp
@@ -0,0 +1,208 @@
+#include <Simulation.hpp>
+#include "SocketServer.h"
+
+
+template<size_t IStreamsSize, size_t OStreamsSize>
+class IsolatedSimulation : public Simulation<IStreamsSize, OStreamsSize, false> {
+    enum class LogType {READY, VALID};
+    std::string readylogName;
+    std::string validlogName;
+    nlohmann::ordered_json readyJson;
+    nlohmann::ordered_json validJson;
+    std::vector<size_t> inJobSizes;
+    std::vector<size_t> outJobSizes;
+
+    /**
+     * For the given streams check which has the largest job size, and return a tuple
+     * (stream_index, job_size) for that stream.
+     **/
+    std::tuple<size_t, size_t> getLargestTxnsStream(std::vector<size_t>& jobSizes) {
+        size_t l = 0;
+        size_t idx = 0;
+        for (size_t i = 0; i < jobSizes.size(); i++) {
+            if (jobSizes[i] > l) {
+                l = jobSizes[i];
+                idx = i;
+            }
+        }
+        return std::make_tuple(idx, l);
+    }
+
+    class SimState {
+        public:
+        bool running;
+        size_t inputCyclesDone;
+        size_t inputCyclesTarget;
+        size_t inputLargestStreamIndex;
+        size_t outputCyclesDone;
+        size_t outputCyclesTarget;
+        size_t outputLargestStreamIndex;
+        size_t totalCycles;
+
+        SimState(IsolatedSimulation<IStreamsSize, OStreamsSize>& sim) {
+            reset(sim);
+        }
+        void reset(IsolatedSimulation<IStreamsSize, OStreamsSize>& sim) {
+            totalCycles = 0;
+            inputCyclesDone = 0;
+            outputCyclesDone = 0;
+            running = false;
+            auto largestIn = sim.getLargestTxnsStream(sim.inJobSizes);
+            auto largestOut = sim.getLargestTxnsStream(sim.outJobSizes);
+            inputCyclesTarget = std::get<1>(largestIn) * 2;
+            inputLargestStreamIndex = std::get<0>(largestIn);
+            outputCyclesTarget = std::get<1>(largestOut) * 2;
+            outputLargestStreamIndex = std::get<0>(largestOut);
+        }
+        inline bool inputCyclesProcessed() { return inputCyclesDone >= inputCyclesTarget; }
+        inline bool outputCyclesProcessed() { return outputCyclesDone >= outputCyclesTarget; }
+        inline bool allCyclesProcessed() { return inputCyclesProcessed() && outputCyclesProcessed(); }
+        inline bool isRunning() { return running; }
+        void setRunning(bool v) { running = v; }
+        std::string getCycleStateInput() { return std::to_string(totalCycles) + "," + std::to_string(inputCyclesDone) + "," + std::to_string(inputCyclesTarget); }
+        std::string getCycleStateOutput() { return std::to_string(totalCycles) + "," + std::to_string(outputCyclesDone) + "," + std::to_string(outputCyclesTarget); }
+        json getStatus() {
+            json j;
+            if (!running && allCyclesProcessed()) {
+                j["state"] = "done";
+            } else {
+                j["state"] = running ? "running" : "halted";
+            }
+            j["totalCycles"] = totalCycles;
+            j["inputCyclesDone"] = inputCyclesDone;
+            j["inputCyclesTarget"] = inputCyclesTarget;
+            j["outputCyclesDone"] = outputCyclesDone;
+            j["outputCyclesTarget"] = outputCyclesTarget;
+            return j;
+        }
+    };
+
+
+    /** Log the ready and valid signals to the JSON fields **/
+    void logReady() {
+        nlohmann::ordered_json j;
+        j["totalCycles"] = simState.totalCycles;
+        j["inputCyclesDone"] = simState.inputCyclesDone;
+        j["inputCyclesTarget"] = simState.inputCyclesTarget;
+        for (S_AXIS_Control& s : this->istreams) {
+            j[s.name] = s.getInputReady();
+        }
+        readyJson.push_back(j);
+    }
+
+    void logValid() {
+        nlohmann::ordered_json j;
+        j["totalCycles"] = simState.totalCycles;
+        j["outputCyclesDone"] = simState.outputCyclesDone;
+        j["outputCyclesTarget"] = simState.outputCyclesTarget;
+        for (M_AXIS_Control& s : this->ostreams) {
+            j[s.name] = s.getOutputValid();
+        }
+        validJson.push_back(j);
+    }
+
+    SimState simState;
+
+    public:
+    IsolatedSimulation(
+        const std::string& kernel_lib,
+        const std::string& design_lib,
+        const char* xsim_log_file,
+        const char* trace_file,
+        std::array<StreamDescriptor, IStreamsSize> _istream_descs,
+        std::array<StreamDescriptor, OStreamsSize> _ostream_descs
+    ) : Simulation<IStreamsSize, OStreamsSize, false>(
+        kernel_lib, design_lib, xsim_log_file, trace_file, _istream_descs, _ostream_descs
+    ), simState(*this), readyJson(json::array()), validJson(json::array()),
+    readylogName("readylog.txt"), validlogName("validlog.txt") {
+        // TODO: Clearly split names between connected and isolated sim (ready_log.txt and readylog.txt)
+        inJobSizes.resize(_istream_descs.size());
+        outJobSizes.resize(_ostream_descs.size());
+        std::transform(
+            _istream_descs.begin(),
+            _istream_descs.end(),
+            inJobSizes.begin(),
+            [](StreamDescriptor& s) { return s.job_size; }
+        );
+        std::transform(
+            _ostream_descs.begin(),
+            _ostream_descs.end(),
+            outJobSizes.begin(),
+            [](StreamDescriptor& s) { return s.job_size; }
+        );
+    }
+
+    /** Write logs to disk **/
+    void commitLogsToDisk(bool clearLogs = true) {
+        std::ofstream r(readylogName, std::ios::trunc);
+        std::ofstream v(validlogName, std::ios::trunc);
+        r << std::setw(4) << readyJson;
+        std::cout << "Writing ready log: " << readyJson.size() << " elements." << std::endl;
+        v << std::setw(4) << validJson;
+        std::cout << "Writing valid log: " << validJson.size() << " elements." << std::endl;
+        r.close();
+        v.close();
+        if (clearLogs) {
+            readyJson = json::array();
+            validJson = json::array();
+        }
+    }
+
+    json getStatus() {
+        return simState.getStatus();
+    }
+
+    void halt() {
+        simState.setRunning(false);
+    }
+
+    void resume() {
+        simState.setRunning(true);
+    }
+
+    bool isRunning() { return simState.isRunning(); }
+
+    bool isDone() {
+        return !simState.isRunning() && simState.allCyclesProcessed();
+    }
+
+    /***
+     * Simulate a single cycle
+     ***/
+    void simulate(bool restart = false) {
+        if (restart) {
+            simState.reset(*this);
+            simState.setRunning(true);
+            std::cout << "Sim set to running: " << simState.isRunning() << std::endl;
+            std::cout << "Target input/output cycles: " << simState.inputCyclesTarget << ", " << simState.outputCyclesTarget << std::endl;
+            this->clearPorts();
+            this->reset();
+            for (S_AXIS_Control& s : this->istreams) {
+                s.setInputValid(true);
+            }
+            for (M_AXIS_Control& s : this->ostreams) {
+                s.setOutputReady(true);
+            }
+        }
+
+        if (!simState.isRunning()) {
+            std::cout << "Simulation not running! Send \"start\" command first." << std::endl;
+            return;
+        }
+        if (!simState.allCyclesProcessed()) {
+            logValid();
+            logReady();
+
+            if (!simState.inputCyclesProcessed() && this->istreams[simState.inputLargestStreamIndex].getInputReady()) {
+                ++simState.inputCyclesDone;
+            }
+            if (!simState.outputCyclesProcessed() && this->ostreams[simState.outputLargestStreamIndex].getOutputValid()) {
+                ++simState.outputCyclesDone;
+            }
+            this->clk.toggleClk();
+            ++simState.totalCycles;
+        } else {
+           simState.setRunning(false);
+        }
+    }
+};
diff --git a/finn_xsi/finn_xsi/include/Kernel.h b/finn_xsi/finn_xsi/include/Kernel.h
new file mode 100644
index 0000000000..c7713ea00d
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/Kernel.h
@@ -0,0 +1,133 @@
+#ifndef KERNEL_H_
+#define KERNEL_H_
+
+#include <SharedLibrary.h>
+
+#include <algorithm>
+#include <optional>
+#include <span>
+#include <vector>
+
+#include "xsi.h"
+
+namespace xsi {
+
+    // Forward declarations
+    class Design;
+    class Port;
+
+    class Kernel {
+        //-----------------------------------------------------------------------
+        // Dispatch Table for XSI Functions
+        class Xsi {
+            //- Statics ---------------------
+             public:
+            // Function Indeces
+            static constexpr unsigned get_value = 0, put_value = 1, get_int_port = 2, get_str_port = 3,
+
+                                      get_int = 4, get_port_number = 5,
+
+                                      trace_all = 6, run = 7, restart = 8, get_status = 9, get_error_info = 10,
+
+                                      close = 11;
+
+             private:
+            // Function Names & Types
+            static constexpr unsigned EXTENT = 12;
+            static char const* const FUNC_NAMES[EXTENT];
+            using type_map = std::tuple<
+                // Port Access
+                t_fp_xsi_get_value, t_fp_xsi_put_value, t_fp_xsi_get_int_port, t_fp_xsi_get_str_port,
+
+                // Design Inspection
+                t_fp_xsi_get_int, t_fp_xsi_get_port_number,
+
+                // Simulation Control & Status
+                t_fp_xsi_trace_all, t_fp_xsi_run, t_fp_xsi_restart, t_fp_xsi_get_status, t_fp_xsi_get_error_info,
+
+                // Closing
+                t_fp_xsi_close>;
+
+            //- Actual Contents -------------
+             private:
+            xsiHandle _hdl;
+            void* _func[EXTENT];
+
+            //- Lifecycle: in-place structure inside Kernel only
+             public:
+            Xsi(xsi::SharedLibrary& lib);
+            ~Xsi() {}
+
+             private:
+            Xsi(Xsi const&) = delete;
+            Xsi& operator=(Xsi const&) = delete;
+
+             public:
+            // Move constructor
+            Xsi(Xsi&& other) noexcept;  // Move assignment operator
+            Xsi& operator=(Xsi&& other) noexcept;
+
+            //- Handle Update ---------------
+             public:
+            void setHandle(xsiHandle hdl) noexcept;
+            bool hasValidHandle() const noexcept;
+
+            //- XSI Function Invocation -----
+             public:
+            template<unsigned FID, typename... Args>
+            auto invoke(Args&&... args) const {
+                auto const f = decltype(std::get<FID>(type_map()))(_func[FID]);
+                return (*f)(_hdl, std::forward<Args>(args)...);
+            }
+
+        };  // class Xsi
+
+         private:
+        // Instance State
+        xsi::SharedLibrary _kernel_lib;  // Backing Kernel Library
+        Xsi _xsi;                        // XSI Dispatch Table
+
+        // Optional State once a Design in open
+        xsi::SharedLibrary _design_lib;
+        std::vector<Port> _ports;
+
+         public:
+        Kernel(const std::string& kernel_lib);
+        Kernel(Kernel const&) = delete;
+        Kernel& operator=(Kernel const&) = delete;
+
+        // Move constructor
+        Kernel(Kernel&& other) noexcept;
+        // Move assignment operator
+        Kernel& operator=(Kernel&& other) noexcept;
+
+        ~Kernel();
+
+        // Interface reserved for forwarded access through open Design
+         private:
+        friend Design;
+        friend Port;
+        template<unsigned FID, typename... Args>
+        auto xsi(Args&&... args) const {
+            return _xsi.invoke<FID>(std::forward<Args>(args)...);
+        }
+
+        // Port Accessors inlined below and public through Design
+        Port& getPort(const char* const name);
+        const Port& getPort(const char* const name) const;
+        std::span<Port> ports() noexcept;
+        std::span<const Port> ports() const noexcept;
+
+        // Design con- & destruction hooks
+        void open(const std::string& design_lib, const s_xsi_setup_info& setup_info);
+        void close() noexcept;
+
+         public:
+        // Port count accessor for Design class
+        size_t port_count() const noexcept;
+
+    };  // class Kernel
+
+}  // namespace xsi
+
+#endif /* KERNEL_H_ */
diff --git a/finn_xsi/finn_xsi/include/Port.h b/finn_xsi/finn_xsi/include/Port.h
new file mode 100644
index 0000000000..0b75b0ecfa
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/Port.h
@@ -0,0 +1,64 @@
+#ifndef PORT_H_
+#define PORT_H_
+
+#include <string>
+#include <vector>
+
+#include "xsi.h"
+
+namespace xsi {
+
+    class Kernel;  // Forward declaration
+
+    // Only exists within controlled environment within Kernel with open Design.
+    class Port {
+        Kernel& _kernel;
+        unsigned const _id;
+        std::vector<s_xsi_vlog_logicval> buffer;
+
+         private:
+        friend Kernel;
+        // Con- and destruction under full control of Kernel
+        Port(Port const&) = delete;
+        Port& operator=(Port const&) = delete;
+        Port(Kernel& kernel, const unsigned id);
+
+         public:
+        Port(Port&& other) noexcept;
+        ~Port() noexcept;
+
+         public:
+        const char* name() const noexcept;
+        int dir() const noexcept;
+        unsigned width() const noexcept;
+
+        bool isInput() const noexcept;
+        bool isOutput() const noexcept;
+        bool isInout() const noexcept;
+
+         public:
+        // Buffer Synchronization
+        Port& read();
+        void write_back();
+
+        // Inspection
+        bool hasUnknown() const noexcept;
+        bool isZero() const noexcept;
+        bool operator[](const unsigned idx) const noexcept;
+
+        bool as_bool() const noexcept;
+        unsigned as_unsigned() const noexcept;
+        std::string as_binstr() const;
+        std::string as_hexstr() const;
+
+        // Manipulation
+        Port& clear();
+        Port& set(const unsigned val);
+        Port& set_binstr(const std::string& val);
+        Port& set_hexstr(const std::string& val);
+
+    };  // class Port
+
+}  // namespace xsi
+
+#endif /* PORT_H_ */
diff --git a/finn_xsi/finn_xsi/include/SharedLibrary.h b/finn_xsi/finn_xsi/include/SharedLibrary.h
new file mode 100644
index 0000000000..0f5e768f6c
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/SharedLibrary.h
@@ -0,0 +1,69 @@
+#ifndef SHAREDLIBRARY_H_
+#define SHAREDLIBRARY_H_
+
+#include <optional>
+#include <string>
+#include <utility>
+
+#if defined(_WIN32)
+    #include <windows.h>
+#else
+    #include <dlfcn.h>
+#endif
+
+namespace xsi {
+    class SharedLibrary {
+         public:
+        static char const library_suffix[];
+
+         private:
+        using handle_type =
+#if defined(_WIN32)
+            HINSTANCE;
+#else
+            void*;
+#endif
+
+        //-----------------------------------------------------------------------
+        // Instance State
+         private:
+        handle_type _lib;
+        std::string _path;
+
+        //-----------------------------------------------------------------------
+        // Life Cycle
+         public:
+        SharedLibrary();
+        SharedLibrary(const std::string& path);
+        ~SharedLibrary();
+
+         private:
+        SharedLibrary(SharedLibrary const&) = delete;
+        SharedLibrary& operator=(SharedLibrary const&) = delete;
+
+         public:
+        // Move constructor
+        SharedLibrary(SharedLibrary&& other) noexcept;
+
+        // Move assignment operator
+        SharedLibrary& operator=(SharedLibrary&& other) noexcept;
+
+         public:
+        operator bool() const noexcept;
+        SharedLibrary& open(const std::string& path);
+        SharedLibrary& close() noexcept;
+
+         private:
+        static handle_type load(const std::string& path);
+        void unload() noexcept;
+
+        //-----------------------------------------------------------------------
+        // Accessors
+         public:
+        const std::string& path() const noexcept;
+        std::optional<void*> getsymbol(const char* const name);
+
+    };  // class SharedLibrary
+}  // namespace xsi
+
+#endif /* SHAREDLIBRARY_H_ */
diff --git a/finn_xsi/finn_xsi/include/Simulation.hpp b/finn_xsi/finn_xsi/include/Simulation.hpp
new file mode 100644
index 0000000000..64bcc304ea
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/Simulation.hpp
@@ -0,0 +1,392 @@
+#ifndef SIMULATION
+#define SIMULATION
+#include <AXIS_Control.h>
+#include <Clock.h>
+#include <Design.h>
+#include <FIFO.h>
+#include <Kernel.h>
+#include <Port.h>
+#include <SharedLibrary.h>
+#include <helper.h>
+
+#include <InterprocessCommunicationChannelInterface.hpp>
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <optional>
+#include <stdexcept>
+#include <stop_token>
+#include <string>
+#include <string_view>
+
+
+template<size_t IStreamsSize, size_t OStreamsSize, bool LoggingEnabled>
+class Simulation {
+     protected:
+    std::ofstream readyLog;
+    std::ofstream validLog;
+
+     public:
+    xsi::Kernel kernel;
+    xsi::Design top;
+    // S_AXIS_Control goes into the simulated layer
+    std::array<S_AXIS_Control, IStreamsSize> istreams;
+    // M_AXIS_Control comes from the simulated layer
+    std::array<M_AXIS_Control, OStreamsSize> ostreams;
+    Clock clk;
+
+
+    Simulation(const std::string& kernel_lib, const std::string& design_lib, const char* xsim_log_file, const char* trace_file,
+               std::array<StreamDescriptor, IStreamsSize> _istream_descs, std::array<StreamDescriptor, OStreamsSize> _ostream_descs)
+        : kernel(kernel_lib), top(kernel, design_lib, xsim_log_file, trace_file), clk(top) {
+        if (trace_file) {
+            top.trace_all();
+        }
+
+        // Find I/O Streams and initialize their Status
+        for (size_t i = 0; i < _istream_descs.size(); ++i) {
+            istreams[i] = S_AXIS_Control{top, clk, std::data(_istream_descs)[i].job_size, std::data(_istream_descs)[i].job_size, std::data(_istream_descs)[i].name};
+        }
+        for (size_t i = 0; i < _ostream_descs.size(); ++i) {
+            ostreams[i] = M_AXIS_Control{top, clk, std::data(_ostream_descs)[i].job_size, std::data(_ostream_descs)[i].name};
+        }
+
+        // Save simulation input output behaviour
+        if constexpr (LoggingEnabled) {
+            readyLog.open("ready_log.txt");
+            validLog.open("valid_log.txt");
+        }
+
+        // Find Global Control & Run Startup Sequence
+        clearPorts();
+        reset();
+    }
+
+    template<std::size_t Index>
+    bool hasValidOutput() {
+        // static_assert(Index < ostreams.size(), "Cannot request valid status of unknown output stream index");
+        return ostreams[Index].is_valid();
+    }
+
+    void clearPorts() noexcept {
+        // Clear all input ports
+        for (xsi::Port& p : top.ports()) {
+            if (p.isInput()) {
+                p.clear().write_back();
+            }
+        }
+    }
+
+    void reset() noexcept {
+        xsi::Port& rst_n = top.getPort("ap_rst_n");
+        // Reset all Inputs, Wait for Reset Period
+        rst_n.set(0).write_back();
+        for (unsigned i = 0; i < 16; i++) {
+            clk.toggleClk();
+        }
+        rst_n.set(1).write_back();
+    }
+};
+
+// Small struct used for exange. Will be changed later to more complex data structure.
+struct CommData {
+    bool data;
+};
+
+// Communication Flow:
+//
+//           valid      ┌──────────────────────────────────────┐     valid            valid
+//   SHM   ─────────>   │         valid            valid       │    ─────────>  FIFO  ─────>   SHM
+//  (pred) <───────── istream  ─────────>  xsim  ─────────> ostream <─────────        <─────  (succ)
+//           ready      │      <─────────        <─────────    │     ready            ready
+//                      │         ready            ready       │
+//                      │                  (sim)               │
+//                      └──────────────────────────────────────┘
+template<size_t IStreamsSize, size_t OStreamsSize, bool LoggingEnabled, size_t NodeIndex, size_t TotalNodes, bool FirstNode, bool LastNode>
+class SingleNodeSimulation : public Simulation<IStreamsSize, OStreamsSize, LoggingEnabled> {
+    using ConsumingInterface = InterprocessCommunicationChannel<CommData, CommData, true>;
+    using ProducingInterface = InterprocessCommunicationChannel<CommData, CommData, false>;
+    std::array<ConsumingInterface, IStreamsSize> fromProducerInterface;
+    std::array<ProducingInterface, OStreamsSize> toConsumerInterface;
+    std::size_t cyclesRun = 0;
+    std::size_t completedMaps = 0;
+    std::array<FIFO, OStreamsSize> fifo;
+
+    /**
+     * Initialize streams according to nodeindex
+     */
+    void initStreams() {
+        if constexpr (FirstNode) {             // First Node; no predecessor
+            for (auto&& s : this->istreams) {  // Input into sim valid
+                s.setInputValid(true);
+            }
+        } else if constexpr (LastNode) {       // Last Node; no successor
+            for (auto&& s : this->ostreams) {  // Output from sim ready
+                s.setOutputReady(true);
+            }
+        }
+    }
+
+    [[gnu::hot, gnu::always_inline]] bool runSingleCycle(std::stop_token stoken = {}) {
+        ++cyclesRun;
+        bool ret = false;
+        if constexpr (!FirstNode) {
+            for (std::size_t i = 0; i < IStreamsSize; ++i) {
+                // Interface SHM <-> sim
+                bool istreamReady = this->istreams[i].getInputReady();
+                bool fifoValid = fromProducerInterface[i].send_request(CommData{istreamReady}, stoken).data;
+                this->istreams[i].setValid(fifoValid);  // deferred
+            }
+        }
+        if constexpr (!LastNode) {
+            for (std::size_t i = 0; i < OStreamsSize; ++i) {
+                // Interface sim -valid-> FIFO
+                this->fifo[i].setInputValid(this->ostreams[i].getOutputValid(), stoken);
+                // Interface FIFO <-> SHM
+                this->fifo[i].setOutputReady(toConsumerInterface[i].receive_request(stoken).data, stoken);
+                
+                // Toggle FIFO clock
+                ret |= this->fifo[i].toggleClock();
+                bool fifoValid = this->fifo[i].getOutputValid();
+                toConsumerInterface[i].send_response(CommData{fifoValid});
+                // FIFO -ready-> sim
+                this->ostreams[i].setReady(this->fifo[i].getInputReady());
+            }
+        }
+        if constexpr (LastNode) {
+            for (auto&& stream : this->ostreams) {
+                if (stream.getOutputValid() && ++stream.job_txns == stream.job_size) {
+                    // Track job completion and intervals
+                    std::size_t lastComplete = stream.lastComplete;
+                    stream.interval = cyclesRun - lastComplete;
+                    stream.lastComplete = cyclesRun;
+                    stream.job_txns = 0;
+                    ++completedMaps;
+                    if (lastComplete != 0) {
+                        // Update stable state tracker
+                        stream.stableState.update(stream.interval);
+                    }
+                }
+            }
+        }
+        // ── CLOCK HIGH ─────────────────────────────────────────────────────────
+        this->clk.clockHigh();  // run(1) [gap] → clk=1 → run(1)
+
+        // ── WRITE (clock is high, commit deferred setValid / setReady) ─────────
+        //
+        // The deferred values were prepared at the end of the previous cycle's read
+        // phase (or are defaults for the first cycle).
+        for (std::size_t i = 0; i < IStreamsSize; ++i) {
+            this->istreams[i].writeBack();
+        }
+        for (std::size_t i = 0; i < OStreamsSize; ++i) {
+            this->ostreams[i].writeBack();
+        }
+
+        // ── CLOCK LOW ──────────────────────────────────────────────────────────
+        this->clk.clockLow();  // run(4999) → clk=0 → run(4999)  ← sim settles
+        return ret;
+    }
+
+     public:
+    SingleNodeSimulation(const std::string& kernel_lib, const std::string& design_lib, const char* xsim_log_file, const char* trace_file,
+                         std::array<StreamDescriptor, IStreamsSize> _istream_descs, std::array<StreamDescriptor, OStreamsSize> _ostream_descs,
+                         std::array<std::string_view, IStreamsSize> inputInterfaceNames, std::array<std::string_view, OStreamsSize> outputInterfaceNames,
+                         unsigned int initialFIFODepth = 2)
+        : Simulation<IStreamsSize, OStreamsSize, LoggingEnabled>(kernel_lib, design_lib, xsim_log_file, trace_file, _istream_descs, _ostream_descs) {
+        if (!FirstNode && inputInterfaceNames.empty()) {
+            throw std::runtime_error("Cannot communicate with predecessor because previous node name was not given!");
+        }
+        if (!LastNode && outputInterfaceNames.empty()) {
+            throw std::runtime_error(
+                "Cannot communicate with successor because "
+                "current node name was not given!");
+        }
+
+        if constexpr (!LastNode) {
+            // Create FIFO buffer
+            for (std::size_t i = 0; i < OStreamsSize; ++i) {
+                fifo[i] = FIFO(initialFIFODepth);
+            }
+        }
+
+        std::cout << "Initialized " << OStreamsSize << " output FIFOs with depth " << initialFIFODepth << std::endl;
+
+        if constexpr (!LastNode) {
+            // Create consumer facing interfaces
+            for (std::size_t i = 0; i < OStreamsSize; ++i) {
+                std::string shmName{outputInterfaceNames[i]};
+                toConsumerInterface[i] = std::move(ProducingInterface(shmName));
+            }
+        }
+
+        std::cout << "Initialized " << OStreamsSize << " producing interfaces for successor communication" << std::endl;
+
+        if constexpr (!FirstNode) {
+            for (std::size_t i = 0; i < IStreamsSize; ++i) {
+                std::string shmName{inputInterfaceNames[i]};
+                fromProducerInterface[i] = std::move(ConsumingInterface(shmName));
+            }
+        }
+
+        std::cout << "Initialized " << IStreamsSize << " consuming interfaces for predecessor communication" << std::endl;
+
+        // Verify communication works
+        if constexpr (!LastNode) {
+            for (std::size_t i = 0; i < OStreamsSize; ++i) {
+                toConsumerInterface[i].handshake();
+            }
+        }
+        if constexpr (!FirstNode) {
+            for (std::size_t i = 0; i < IStreamsSize; ++i) {
+                fromProducerInterface[i].handshake();
+            }
+        }
+
+        this->clk.clockHigh();
+        initStreams();
+        this->clk.clockLow();
+        std::cout << "Finished initializing simulation." << std::endl;
+    }
+
+    /// Reset simulation (stream and current FIFO depth, as well as cycle counter)
+    void reset() {
+        Simulation<IStreamsSize, OStreamsSize, LoggingEnabled>::reset();
+        if constexpr (!LastNode) {
+            // Reset FIFOs
+            for (std::size_t i = 0; i < OStreamsSize; ++i) {
+                fifo[i].reset();
+            }
+        }
+    }
+
+    [[gnu::hot, gnu::always_inline]] void runFeatureMaps(std::size_t featureMaps, std::stop_token stoken = {}) {
+        completedMaps = 0;
+        while (completedMaps < featureMaps && !stoken.stop_requested()) {
+            runSingleCycle(stoken);
+        }
+    }
+
+    [[gnu::hot, gnu::always_inline]] bool runToStableState(std::stop_token stoken = {}, std::size_t max_cycles = std::numeric_limits<std::size_t>::max()) {
+        bool timeout = false;
+        while (!std::all_of(this->ostreams.begin(), this->ostreams.end(), [](const M_AXIS_Control& stream) { return stream.stableState.is_stable(); }) & !stoken.stop_requested() &
+               (cyclesRun <= max_cycles) & !timeout) {
+            timeout |= runSingleCycle(stoken);
+            timeout |= runSingleCycle(stoken);
+            timeout |= runSingleCycle(stoken);
+            timeout |= runSingleCycle(stoken);
+        }
+        return timeout || cyclesRun > max_cycles;
+    }
+
+    /// Get the number of FIFOs
+    std::size_t getFIFOCount() const noexcept {
+        if constexpr (LastNode) {
+            return 0;
+        }
+        return OStreamsSize;
+    }
+
+    /// Set the depth of a specific FIFO
+    void setFIFODepth(std::size_t index, std::size_t depth) {
+        if constexpr (LastNode) {
+            throw std::runtime_error("Cannot set FIFO depth on last node (no FIFOs present)");
+        }
+        if (index >= OStreamsSize) {
+            auto error = "FIFO index " + std::to_string(index) + " out of range (max: " + std::to_string(OStreamsSize - 1) + ")";
+            throw std::out_of_range(error);
+        }
+        fifo[index].setMaxSize(depth);
+    }
+
+    void setFIFOCyclesUntilExpectedFirstValid(std::size_t index, std::size_t cycles) {
+        if constexpr (LastNode) {
+            throw std::runtime_error("Cannot set FIFO cycles until expected first valid on last node (no FIFOs present)");
+        }
+        if (index >= OStreamsSize) {
+            auto error = "FIFO index " + std::to_string(index) + " out of range (max: " + std::to_string(OStreamsSize - 1) + ")";
+            throw std::out_of_range(error);
+        }
+        fifo[index].setCyclesUntilExpectedFirstValid(cycles);
+    }
+
+    /// Set the max FIFO depth of all interfaces
+    void setMaxFIFODepth(std::size_t depth) {
+        if constexpr (!LastNode) {
+            for (FIFO& f : fifo) {
+                f.setMaxSize(depth);
+            }
+        }
+    }
+
+    std::array<std::size_t, OStreamsSize> getFIFODepth() const noexcept {
+        if constexpr (LastNode) {
+            return {};
+        }
+        std::array<std::size_t, OStreamsSize> utilizations{};
+        for (std::size_t i = 0; i < OStreamsSize; ++i) {
+            utilizations[i] = fifo[i].getMaxSize();
+        }
+        return utilizations;
+    }
+
+    std::array<std::size_t, OStreamsSize> getFIFOCyclesUntilFirstValid() const noexcept {
+        if constexpr (LastNode) {
+            return {};
+        }
+        std::array<std::size_t, OStreamsSize> cycles{};
+        for (std::size_t i = 0; i < OStreamsSize; ++i) {
+            cycles[i] = fifo[i].getCyclesUntilFirstValid();
+        }
+        return cycles;
+    }
+
+    /// Get the job size of the specified output stream
+    std::size_t getOutputJobSize(std::size_t outputIndex = 0) { return this->ostreams[outputIndex].job_size; }
+
+    /// Get the job size of the specified input stream
+    std::size_t getInputJobSize(std::size_t inputIndex = 0) { return this->istreams[inputIndex].job_size; }
+
+    /// Get the number of cycles the simulation has run
+    std::size_t getCyclesRun() const noexcept { return cyclesRun; }
+
+    /// Get the number of completed feature maps
+    std::size_t getCompletedMaps() const noexcept { return completedMaps; }
+
+    /// Get the maximum FIFO utilization for each output stream
+    std::array<std::size_t, OStreamsSize> getFIFOUtilization() const noexcept {
+        if constexpr (LastNode) {
+            return {};
+        }
+        std::array<std::size_t, OStreamsSize> utilizations{};
+        for (std::size_t i = 0; i < OStreamsSize; ++i) {
+            utilizations[i] = fifo[i].getMaxUtil();
+        }
+        return utilizations;
+    }
+
+    /// Get the current Ostream stable state intervals.
+    /// Returns the rounded EMA of observed output intervals so that a single noisy
+    /// measurement at the boundary of stability does not cause _check_performance to
+    /// report a false positive or negative (raw last interval can differ from the EMA
+    /// by up to the StableStateTracker stability threshold in either direction).
+    /// This should not be the case, but its an additional security measure.
+    std::array<std::size_t, OStreamsSize> getOStreamStableStateIntervals() const noexcept {
+        std::array<std::size_t, OStreamsSize> intervals{};
+        if constexpr (LastNode) {
+            for (std::size_t i = 0; i < OStreamsSize; ++i) {
+                const double ema = this->ostreams[i].stableState.get_ema();
+                // Fall back to the raw interval when the EMA has never been updated
+                // (ema == 0.0 means no second job completion has occurred yet).
+                intervals[i] = (ema > 0.0) ? static_cast<std::size_t>(std::round(ema)) : this->ostreams[i].interval;
+            }
+        }
+        return intervals;
+    }
+};
+
+
+#endif /* SIMULATION */
diff --git a/finn_xsi/finn_xsi/include/SocketServer.h b/finn_xsi/finn_xsi/include/SocketServer.h
new file mode 100644
index 0000000000..9d7e597b9c
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/SocketServer.h
@@ -0,0 +1,40 @@
+#ifndef SOCKET_SERVER_H
+#define SOCKET_SERVER_H
+
+#include <nlohmann/json.hpp>
+#include <optional>
+#include <string>
+#include <string_view>
+
+using json = nlohmann::ordered_json;
+
+class SocketServer {
+     private:
+    int server_fd{-1};
+    int client_fd{-1};
+    std::string socket_path;
+
+    void close_fd(int& fd) noexcept;
+
+     public:
+    explicit SocketServer(std::string_view path);
+    ~SocketServer();
+
+    // Disable copy construction and assignment
+    SocketServer(const SocketServer&) = delete;
+    SocketServer& operator=(const SocketServer&) = delete;
+
+    // Enable move semantics
+    SocketServer(SocketServer&& other) noexcept;
+    SocketServer& operator=(SocketServer&& other) noexcept;
+
+    // Returns std::nullopt on success, error message on failure
+    [[nodiscard]] std::optional<std::string> initialize();
+    [[nodiscard]] std::optional<json> receive_message();
+    void send_message(const json& message);
+    void close_connection() noexcept;
+
+    [[nodiscard]] bool is_connected() const noexcept { return client_fd >= 0; }
+};
+
+#endif  // SOCKET_SERVER_H
diff --git a/finn_xsi/finn_xsi/include/StableStateTracker.hpp b/finn_xsi/finn_xsi/include/StableStateTracker.hpp
new file mode 100644
index 0000000000..e7da06726d
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/StableStateTracker.hpp
@@ -0,0 +1,84 @@
+#ifndef STABLESTATETRACKER
+#define STABLESTATETRACKER
+
+#include <cstdint>
+#include <concepts>
+
+/**
+ * Implements an Exponential Moving Average (EMA) tracker with stability detection.
+ * The tracker updates its EMA with new unsigned integral values and checks for stability
+ * based on relative changes over consecutive updates.
+ */
+template<double Alpha = 0.3,              // alpha * 100 (20 = 0.20)
+         double StabilityThreshold = 0.1,  // threshold * 100 (5 = 0.05)
+         uint8_t RequiredStableCount = 3>
+    requires (Alpha > 0 && Alpha <= 1) &&
+             (StabilityThreshold > 0 && StabilityThreshold < 1) &&
+             (RequiredStableCount > 0)
+class StableStateTracker {
+private:
+    static constexpr double InvAlpha = 1.0 - Alpha;
+    static constexpr double SquaredStabilityThreshold = StabilityThreshold * StabilityThreshold;
+
+    double ema;
+    uint8_t stableCount;
+
+public:
+    constexpr StableStateTracker() noexcept
+        : ema{0.0}
+        , stableCount{0}
+    {
+    }
+
+    /**
+     * Update with new interval value
+     * Concepts ensure only unsigned integral types are accepted
+     */
+    inline void update(std::unsigned_integral auto value) noexcept {
+        // First update initializes directly
+        if (ema == 0.0) [[unlikely]] {
+            ema = static_cast<double>(value);
+            stableCount = 0;
+            return;
+        }
+
+        const double oldEma = ema;
+        const double valDouble = static_cast<double>(value);
+
+        // EMA calculation: ema = value + (1-alpha) * (oldEma - value)
+        ema = valDouble + InvAlpha * (oldEma - valDouble);
+
+        // Stability check: |change|² / oldEma² < threshold²
+        // Avoids sqrt and abs operations
+        const double diff = ema - oldEma;
+        const double squaredRelativeChange = (diff * diff) / (oldEma * oldEma);
+
+        // Branchless increment/reset using arithmetic
+        const bool is_change_small = squaredRelativeChange < SquaredStabilityThreshold;
+        stableCount = is_change_small * (stableCount + (stableCount < RequiredStableCount));
+    }
+
+    [[nodiscard]] constexpr double get_ema() const noexcept {
+        return ema;
+    }
+
+    [[nodiscard]] constexpr bool is_stable() const noexcept {
+        return stableCount >= RequiredStableCount;
+    }
+
+    [[nodiscard]] constexpr uint8_t get_stable_count() const noexcept {
+        return stableCount;
+    }
+
+    constexpr void reset() noexcept {
+        ema = 0.0;
+        stableCount = 0;
+    }
+
+    // Get compile-time parameters
+    static consteval double get_alpha() { return Alpha; }
+    static consteval double get_stability_threshold() { return StabilityThreshold; }
+    static consteval uint8_t get_required_stable_count() { return RequiredStableCount; }
+};
+
+#endif /* STABLESTATETRACKER */
diff --git a/finn_xsi/finn_xsi/include/helper.h b/finn_xsi/finn_xsi/include/helper.h
new file mode 100644
index 0000000000..aa13c4612a
--- /dev/null
+++ b/finn_xsi/finn_xsi/include/helper.h
@@ -0,0 +1,25 @@
+#ifndef HELPER_H_
+#define HELPER_H_
+
+#include <array>
+#include <string>
+#include <iostream>
+
+constexpr std::array<char, 4> XZ10 = {'0', '1', 'Z', 'X'};
+constexpr std::array<char, 16> HEX = {'0', '1', '2', '3', '4', '5', '6', '7',
+                                      '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
+
+struct StreamDescriptor {
+  std::string name;
+  std::size_t job_size;
+  // // Next job can only start this many clock ticks after start of predecessor.
+  // std::size_t job_ticks;
+};
+
+#ifdef NDEBUG
+[[maybe_unused]] inline void debug([[maybe_unused]] std::string_view s) {}
+#else
+inline void debug(std::string_view s) { std::cout << "log [DBG] " << s << "\n"; }
+#endif
+
+#endif /* HELPER_H_ */
diff --git a/finn_xsi/finn_xsi/rtlsim_config.hpp.template b/finn_xsi/finn_xsi/rtlsim_config.hpp.template
index 3e0b35cc87..6b442ade25 100644
--- a/finn_xsi/finn_xsi/rtlsim_config.hpp.template
+++ b/finn_xsi/finn_xsi/rtlsim_config.hpp.template
@@ -11,33 +11,50 @@
  *	prior to compilation.
  ***************************************************************************/
 
-struct stream_desc {
-	char const *name;
-	size_t      job_size;
-	// Next job can only start this many clock ticks after start of predecessor.
-	size_t      job_ticks;
-};
+#include <cstddef>
+#include <initializer_list>
+#include <string>
+#include <optional>
+#include <helper.h>
+#include <string_view>
 
-// sim kernel .so to use (depends on Vivado version)
-static char const  kernel_libname[] = "@SIMKERNEL_SO@";
+namespace RTLSimConfig {
+	// Log during simulation. Turned off by default. Might increase runtime if used.
+	constexpr bool LoggingEnabled = true;
+	constexpr bool IsInputNode = @IS_INPUT_NODE@;
+	constexpr bool IsOutputNode = @IS_OUTPUT_NODE@;
 
-// design library .so to use (important to use this relative path here,
-// due to how XSI looks for certain files)
-static char const  design_libname[] = "xsim.dir/@TOP_MODULE_NAME@/xsimk.so";
+	/**** General RTLSIM Configuration Parameters ****/
+	constexpr std::array<std::string_view, @INPUT_INTERFACE_COUNT@> inputInterfaceNames { @INPUT_INTERFACE_NAMES@ };
+	constexpr std::array<std::string_view, @OUTPUT_INTERFACE_COUNT@> outputInterfaceNames { @OUTPUT_INTERFACE_NAMES@ };
 
-// AXI stream descriptors {stream_name, transactions_per_inference}
-// input AXI stream descriptors
-static std::initializer_list<stream_desc> const  istream_descs { @ISTREAM_DESC@ };
+	// Which index node this simulation executes
+	// In a complete design simulation this is 0
+	constexpr size_t NodeIndex = @NODE_INDEX@;
 
-// output AXI stream descriptors
-static std::initializer_list<stream_desc> const  ostream_descs { @OSTREAM_DESC@ };
+	// Number of total nodes in the simulation (over all processes)
+	// In a complete design simulation this is 1
+	constexpr size_t TotalNodes = @TOTAL_NODES@;
 
-// number of inferences to perform
-constexpr unsigned  n_inferences = @N_INFERENCES@;
+	// sim kernel .so to use (depends on Vivado version)
+	static char const  kernel_libname[] = "@SIMKERNEL_SO@";
 
-// max number of cycles to wait for output activity on any stream before timeout
-constexpr unsigned  max_iters = @TIMEOUT_CYCLES@;
+	// design library .so to use (important to use this relative path here,
+	// due to how XSI looks for certain files)
+	static char const  design_libname[] = "xsim.dir/@TOP_MODULE_NAME@/xsimk.so";
 
-// filename for trace and debug, if enabled. This needs xelab -debug option too.
-static char const *const trace_filename = @TRACE_FILE@;
-static char const *const xsim_log_filename = @XSIM_LOG_FILE@;
+	// AXI stream descriptors {stream_name, transactions_per_inference}
+	// input AXI stream descriptors
+	constexpr std::array istream_descs { @ISTREAM_DESC@ };
+
+	// output AXI stream descriptors
+	constexpr std::array ostream_descs { @OSTREAM_DESC@ };
+
+	// max number of cycles to wait for output activity on any stream before timeout
+	constexpr unsigned max_iters = @TIMEOUT_CYCLES@;
+
+	// filename for trace and debug, if enabled. This needs xelab -debug option too.
+	static const std::optional<std::string> trace_filename = @TRACE_FILE@;
+	static const std::string xsim_log_filename = @XSIM_LOG_FILE@;
+
+}
diff --git a/finn_xsi/finn_xsi/rtlsim_xsi.cpp b/finn_xsi/finn_xsi/rtlsim_xsi.cpp
deleted file mode 100644
index d4fe79581d..0000000000
--- a/finn_xsi/finn_xsi/rtlsim_xsi.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2025, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * @brief	Driver harness demo running a FINN IP core.
- * @author	Yaman Umuroğlu <yaman.umuroglu@amd.com>
- * @author	Thomas B. Preußer <thomas.preusser@amd.com>
- ***************************************************************************/
-
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <chrono>
-#include <vector>
-#include <tuple>
-#include <functional>
-
-#include "xsi_finn.hpp"
-#include "rtlsim_config.hpp"
-
-int main(int argc, char *argv[]) {
-
-	// Load Kernel and Design
-	xsi::Kernel  kernel(kernel_libname);
-	xsi::Design  top(kernel, design_libname, xsim_log_filename, trace_filename);
-	using  Port = xsi::Port;
-	if(trace_filename) {
-		// TODO make tracing more finer-grain if possible?
-		top.trace_all();
-	}
-
-	// Ultimate Simulation Summary
-	std::string  synopsis;
-
-	{ // RTL Simulation
-
-		// Simulation Report Statistics
-		size_t  iters   = 0;
-		size_t  timeout = 0;
-		size_t  itodo = istream_descs.size();
-		size_t  otodo = ostream_descs.size();
-		size_t  omute = ostream_descs.size();
-
-		// Find I/O Streams and initialize their Status
-		struct stream_status {
-			char const *name;
-			Port &port_vld;
-			Port &port_rdy;
-
-			// Job Size and Transaction Statistics
-			size_t  job_size;
-			size_t  job_txns;  // [0:job_size]
-			size_t  total_txns;
-			size_t  first_complete; // First completion timestamp
-
-			union {
-				// Input Stream
-				struct {
-					size_t  job_ticks;      // throttle if job_size < job_ticks
-					size_t  await_iter;     // iteration allowing start of next job
-				};
-				// Output Stream
-				struct {
-					size_t  last_complete;
-					size_t  interval;
-				};
-			};
-
-		public:
-			stream_status(
-				char const *name, Port &port_vld, Port &port_rdy,
-				size_t  job_size, size_t  job_ticks
-			) : name(name), port_vld(port_vld), port_rdy(port_rdy), job_size(job_size),
-				job_txns(0), total_txns(0),
-				first_complete(0), job_ticks(job_ticks), await_iter(job_ticks) {}
-		};
-		std::vector<stream_status>  istreams;
-		std::vector<stream_status>  ostreams;
-		for(auto  t : { std::tie(istream_descs, istreams), std::tie(ostream_descs, ostreams) }) {
-			for(stream_desc const &desc : std::get<0>(t)) {
-				std::string const  name(desc.name);
-				Port *const  vld = top.getPort(name + "_tvalid");
-				Port *const  rdy = top.getPort(name + "_tready");
-				if(!vld || !rdy) {
-					std::cerr << "Unable to find controls for " << desc.name << std::endl;
-					return  1;
-				}
-
-				std::get<1>(t).emplace_back(desc.name, *vld, *rdy, desc.job_size, desc.job_ticks);
-			}
-		}
-
-		// Find Global Control & Run Startup Sequence
-		std::function<void(bool)>  cycle;
-		{
-			Port *const  clk   = top.getPort("ap_clk");
-			Port *const  clk2x = top.getPort("ap_clk2x");
-			Port *const  rst_n = top.getPort("ap_rst_n");
-			if(!clk) {
-				std::cerr << "No clock found on the design." << std::endl;
-				return  1;
-			}
-			cycle = clk2x?
-				std::function<void(bool)>([&top, clk, clk2x](bool const  up) mutable {
-					clk->set(up).write_back();
-					clk2x->set(1).write_back();
-					top.run(5);
-					clk2x->set(0).write_back();
-					top.run(5);
-				}) :
-				std::function<void(bool)>([&top, clk](bool const  up) mutable {
-					clk->set(up).write_back();
-					top.run(5);
-				});
-
-			// Reset all Inputs, Wait for Reset Period
-			for(Port &p : top.ports()) { if(p.isInput())  p.clear().write_back(); };
-			if(rst_n) {
-				for(unsigned  i = 0; i < 16; i++) { cycle(0); cycle(1); }
-				rst_n->set(1).write_back();
-			}
-		}
-
-		// Start Stream Feed and Capture
-		std::cout << "Starting data feed with idle-output timeout of " << max_iters << " cycles ...\n" << std::endl;
-
-		// Make all Inputs valid & all Outputs ready
-		for(auto &s : istreams)  s.port_vld.set(1).write_back();
-		for(auto &s : ostreams)  s.port_rdy.set(1).write_back();
-
-		// Enter Simulation Loop and track Progress
-		auto const  begin = std::chrono::steady_clock::now();
-		std::vector<std::reference_wrapper<Port>>  to_write;
-		while(true) {
-
-			//-------------------------------------------------------------------
-			// Clock down - then read signal updates from design
-			cycle(0);
-
-			// check for transactions on input streams
-			for(auto &s : istreams) {
-				bool const  vld = s.port_vld[0];
-				bool const  rdy = s.port_rdy.read()[0];
-				if(vld && !rdy)  continue;
-
-				// Track successgul Transactions
-				if(vld) {
-					s.job_txns++;
-					if(++s.total_txns == s.job_size * n_inferences)  itodo--;
-				}
-
-				// Proceed according to Throttling Rate
-				if((s.job_txns < s.job_size) || !(iters < s.await_iter)) {
-					if(s.total_txns < s.job_size * n_inferences) {
-						if(!vld)  to_write.emplace_back(s.port_vld.set(1));
-						if(s.job_txns == s.job_size) {
-							s.job_txns = 0;
-							s.await_iter = iters + s.job_ticks;
-						}
-						continue;
-					}
-				}
-				if(vld)  to_write.emplace_back(s.port_vld.set(0));
-			}
-
-			{ // check for transactions on the output streams
-				bool  dead = true;
-				for(auto &s : ostreams) {
-					if(s.port_rdy[0] && s.port_vld.read()[0]) {
-						size_t const  txns = ++s.total_txns;
-						if(txns == s.job_size) {
-							s.first_complete = iters;
-							omute--;
-						}
-						if(++s.job_txns == s.job_size) {
-							s.interval      = iters - s.last_complete;
-							s.last_complete = iters;
-							s.job_txns = 0;
-						}
-						if(txns >= s.job_size * n_inferences) {
-							if(txns == s.job_size * n_inferences)  otodo--;
-							else {
-								std::cerr << "Spurious output on " << s.name << std::endl;
-								to_write.emplace_back(s.port_rdy.set(0));
-							}
-						}
-						dead = false;
-					}
-				}
-				timeout = dead? timeout + 1 : 0;
-			}
-
-			//-------------------------------------------------------------------
-			// Clock up - then write signal updates back to design
-			cycle(1);
-
-			// Write back Ports with registered updates
-			for(Port &p : to_write)  p.write_back();
-			to_write.clear();
-
-			// Show a progress message once in a while
-			if(++iters % 10000 == 0) {
-				std::cout
-					<< '@' << iters << " ticks / "
-					<< std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - begin).count() << "s:";
-				for(auto const &s : istreams) {
-					std::cout << '\t' << s.name << '=' << ((100 * s.total_txns) / (n_inferences * s.job_size)) << '%';
-				}
-				for(auto const &s : ostreams) {
-					std::cout << '\t' << s.name << '=' << ((100 * s.total_txns) / (n_inferences * s.job_size)) << '%';
-				}
-				std::cout << "\tMute Outputs: " << omute << std::endl;
-			}
-
-			// Check for exit
-			if((timeout > max_iters) || (!itodo && !otodo))  break;
-		}
-
-		size_t  total_in_txns = 0;
-		for(auto const &s : istreams)  total_in_txns += s.total_txns;
-
-		size_t  total_out_txns = 0;
-		size_t  firstout_latency = 0;
-		size_t  max_interval = 0;
-		for(auto const &s : ostreams) {
-			total_out_txns  += s.total_txns;
-			firstout_latency = std::max(firstout_latency, s.first_complete);
-			max_interval     = std::max(max_interval,     s.interval);
-		}
-
-		std::ostringstream  bld;
-		bld <<
-			"N_IN_TXNS\t" << total_in_txns << "\n"
-			"N_OUT_TXNS\t" << total_out_txns << "\n"
-			"cycles\t" << iters << "\n"
-			"N\t" << n_inferences << "\n"
-			"latency_cycles\t" << firstout_latency << "\n"
-			"interval_cycles\t" << max_interval << "\n"
-			"TIMEOUT\t" << (timeout > max_iters? "1" : "0") << "\n"
-			"UNFINISHED_INS\t" << itodo << "\n"
-			"UNFINISHED_OUTS\t" << otodo << "\n"
-			"RUNTIME_S\t" << std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - begin).count();
-		synopsis = bld.str();
-
-	} // done simulation
-
-	// Dump Simulation Statistics to stdout and results.txt
-	std::cout << '\n' << synopsis << std::endl;
-
-	{ // Log error info to file
-		std::ofstream  error_file("fifosim.err", std::ios::out | std::ios::trunc);
-		error_file << top.get_error_info();
-	}
-
-	{ // Synopsis and `max_count` readings to results file
-		std::ofstream  results_file("results.txt", std::ios::out | std::ios::trunc);
-		results_file << synopsis << std::endl;
-		for(Port &p : top.ports()) {
-			if(p.isOutput()) {
-				char const *const  name = p.name();
-				if(std::strncmp(name, "maxcount", 8) == 0) {
-					p.read();
-					results_file << name << '\t' << p.as_unsigned() << std::endl;
-				}
-			}
-		}
-	}
-
-	return 0;
-}
diff --git a/finn_xsi/finn_xsi/src/AXIS_Control.cpp b/finn_xsi/finn_xsi/src/AXIS_Control.cpp
new file mode 100644
index 0000000000..9e91cbaa38
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/AXIS_Control.cpp
@@ -0,0 +1,76 @@
+#include <AXIS_Control.h>
+#include <Clock.h>
+#include <Design.h>
+#include <Port.h>
+
+#include <stdexcept>
+
+std::string sanitize_prefix(const std::string& prefix) {
+    if (prefix.empty()) {
+        throw std::invalid_argument("AXI prefix cannot be empty.");
+    }
+    std::string sanitized = prefix;
+    if (sanitized.back() != '_') {
+        sanitized += "_";
+    }
+    return sanitized;
+}
+
+std::string remove_trailing_underscore(const std::string& prefix) {
+    std::string clean = prefix;
+    if (clean.back() == '_') {
+        // If checks implicitly that pop_back() doesn't have undefined behav.
+        clean.pop_back();
+    }
+    return clean;
+}
+
+AXIS_Control::AXIS_Control(xsi::Design& des, Clock& clock, size_t job_sz, const std::string& prefix)
+    : job_size(job_sz),
+      job_txns(0),
+      total_txns(0),
+      first_complete(0),
+      name(remove_trailing_underscore(sanitize_prefix(prefix))),
+      design(&des),
+      clk(&clock),
+      port_vld(&des.getPort(sanitize_prefix(prefix) + "tvalid")),
+      port_rdy(&des.getPort(sanitize_prefix(prefix) + "tready")) {}
+
+void AXIS_Control::inititialized_or_throw() {
+    if (!design || !clk || !port_rdy || !port_vld) {
+        throw std::runtime_error("AXIS Control object not correctly initialized! Aborting!");
+    }
+}
+
+void AXIS_Control::setInputValid(bool value, [[maybe_unused]] std::stop_token stoken) { port_vld->set(static_cast<unsigned int>(value)).write_back(); }
+
+bool AXIS_Control::getOutputValid([[maybe_unused]] std::stop_token stoken) noexcept { return port_vld->read().as_bool(); }
+
+void AXIS_Control::setOutputReady(bool value, [[maybe_unused]] std::stop_token stoken) { port_rdy->set(static_cast<unsigned int>(value)).write_back(); }
+bool AXIS_Control::getInputReady([[maybe_unused]] std::stop_token stoken) noexcept { return port_rdy->read().as_bool(); }
+
+// Deferred write functions
+std::reference_wrapper<xsi::Port> AXIS_Control::setValid(bool value) { return std::ref(port_vld->set(value ? 1 : 0)); }
+
+std::reference_wrapper<xsi::Port> AXIS_Control::setReady(bool value) { return std::ref(port_rdy->set(value ? 1 : 0)); }
+
+S_AXIS_Control::S_AXIS_Control(xsi::Design& des, Clock& clock, size_t job_sz, size_t job_tks, const std::string& prefix)
+    : AXIS_Control(des, clock, job_sz, prefix), job_ticks(job_tks), await_iter(job_tks) {
+    if (job_sz < 1 || job_tks < 1) {
+        throw std::invalid_argument("Job size and ticks must be greater than 0.");
+    }
+}
+
+void S_AXIS_Control::writeBack() {
+    this->port_vld->write_back();
+}
+
+M_AXIS_Control::M_AXIS_Control(xsi::Design& des, Clock& clock, size_t job_sz, const std::string& prefix) : AXIS_Control(des, clock, job_sz, prefix), lastComplete(0), interval(0) {
+    if (job_sz < 1) {
+        throw std::invalid_argument("Job size must be greater than 0.");
+    }
+}
+
+void M_AXIS_Control::writeBack() {
+    this->port_rdy->write_back();
+}
diff --git a/finn_xsi/finn_xsi/src/AXI_Control.cpp b/finn_xsi/finn_xsi/src/AXI_Control.cpp
new file mode 100644
index 0000000000..fa3c8b6f35
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/AXI_Control.cpp
@@ -0,0 +1,188 @@
+#include <AXI_Control.h>
+#include <Clock.h>
+#include <Design.h>
+#include <Port.h>
+
+#include <bitset>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+using namespace xsi;
+
+// Constructor
+AXI_Control::AXI_Control(xsi::Design& des, Clock& clock, const std::string& axi_prefix) : prefix(axi_prefix), design(des), clk(clock) {
+    // Check if the prefix is valid
+    if (prefix.empty()) {
+        throw std::invalid_argument("AXI prefix cannot be empty.");
+    }
+
+    // Ensure the prefix ends with an underscore
+    if (prefix.back() != '_') {
+        prefix += "_";
+    }
+}
+
+// Helper functions for multi-bit signal handling
+void AXI_Control::writeAddr(const std::string& signal, uint32_t addr) {
+    // Convert addr to binary string
+    std::string addr_bin = std::bitset<32>(addr).to_string();
+
+    // Remove leading zeros to get the actual size used in the simulation
+    addr_bin.erase(0, addr_bin.find_first_not_of('0'));
+
+
+    // Get port size
+    Port& port = design.getPort(signal);
+    auto n_bits = port.width();
+
+    // Ensure the string is the right length
+    if (addr_bin.length() < n_bits) {
+        addr_bin = std::string(n_bits - addr_bin.length(), '0') + addr_bin;
+    } else if (addr_bin.length() > n_bits) {
+        addr_bin = addr_bin.substr(addr_bin.length() - n_bits);
+    }
+
+    port.set_binstr(addr_bin).write_back();
+}
+
+void AXI_Control::writeData(const std::string& signal, uint32_t data) {
+    // Similar to write_addr
+    std::string data_bin = std::bitset<32>(data).to_string();
+
+    // Get port size
+    Port& port = design.getPort(signal);
+    auto n_bits = port.width();
+
+    if (data_bin.length() < n_bits) {
+        data_bin = std::string(n_bits - data_bin.length(), '0') + data_bin;
+    } else if (data_bin.length() > n_bits) {
+        data_bin = data_bin.substr(data_bin.length() - n_bits);
+    }
+
+    port.set_binstr(data_bin).write_back();
+}
+
+void AXI_Control::writeStrb(const std::string& signal, uint32_t strb) {
+    // Similar to write_addr
+    std::string strb_bin = std::bitset<4>(strb).to_string();
+
+    // Get port size
+    Port& port = design.getPort(signal);
+    auto n_bits = port.width();
+
+    if (strb_bin.length() < n_bits) {
+        strb_bin = std::string(n_bits - strb_bin.length(), '0') + strb_bin;
+    } else if (strb_bin.length() > n_bits) {
+        strb_bin = strb_bin.substr(strb_bin.length() - n_bits);
+    }
+
+    port.set_binstr(strb_bin).write_back();
+}
+
+uint32_t AXI_Control::read(const std::string& signal) {
+    Port& port = design.getPort(signal);
+    return port.read().as_unsigned();
+}
+
+void AXI_Control::setBool(const std::string& signal) {
+    Port& port = design.getPort(signal);
+    port.set(1).write_back();
+}
+
+void AXI_Control::clearBool(const std::string& signal) {
+    Port& port = design.getPort(signal);
+    port.set(0).write_back();
+}
+
+bool AXI_Control::chkBool(const std::string& signal) {
+    Port& port = design.getPort(signal);
+    return port.read().as_bool();
+}
+
+void AXI_Control::writeRegister(uint32_t addr, uint32_t data) {
+    // Assert BREADY to receive response
+    setBool(prefix + "bready");
+    // Set address
+    writeAddr(prefix + "awaddr", addr);
+    // Set data and strobe (full 32-bit word)
+    writeData(prefix + "wdata", data);
+    writeStrb(prefix + "wstrb", 0xF);  // All bytes enabled
+
+    // Assert AWVALID
+    setBool(prefix + "awvalid");
+
+    // Assert WVALID
+    setBool(prefix + "wvalid");
+
+    // Wait for AWREADY
+    while (!chkBool(prefix + "awready")) {
+        clk.toggleClk();
+    }
+
+    // Wait for WREADY
+    while (!chkBool(prefix + "wready")) {
+        clk.toggleClk();
+    }
+
+    clk.toggleClk();  // Make sure that for at least one cycle the signals were set
+
+    // Deassert AWVALID and WVALID
+    clearBool(prefix + "awvalid");
+    clearBool(prefix + "wvalid");
+
+
+    // Wait for BVALID
+    while (!chkBool(prefix + "bvalid")) {
+        clk.toggleClk();
+    }
+
+    // Check BRESP (optional, could add error handling)
+    uint32_t bresp = read(prefix + "bresp");
+    if (bresp != 0) {
+        std::cerr << "AXI write error: BRESP = " << bresp << std::endl;
+    }
+
+    // Deassert BREADY
+    clearBool(prefix + "bready");
+
+    clk.toggleClk();
+}
+
+uint32_t AXI_Control::readRegister(uint32_t addr) {
+    // Assert RREADY to receive data
+    setBool(prefix + "rready");
+    // Set address
+    writeAddr(prefix + "araddr", addr);
+
+    // Assert ARVALID
+    setBool(prefix + "arvalid");
+
+    // Wait for ARREADY
+    while (!chkBool(prefix + "arready")) {
+        clk.toggleClk();
+    }
+
+    // Wait for RVALID
+    while (!chkBool(prefix + "rvalid")) {
+        clk.toggleClk();
+    }
+
+    // Deassert ARVALID
+    clearBool(prefix + "arvalid");
+
+    // Read data
+    uint32_t data = read(prefix + "rdata");
+
+    // Check RRESP (optional, could add error handling)
+    uint32_t rresp = read(prefix + "rresp");
+    if (rresp != 0) {
+        std::cerr << "AXI read error: RRESP = " << rresp << std::endl;
+    }
+
+    // Deassert RREADY
+    clearBool(prefix + "rready");
+    clk.toggleClk();
+
+    return data;
+}
diff --git a/finn_xsi/finn_xsi/src/Clock.cpp b/finn_xsi/finn_xsi/src/Clock.cpp
new file mode 100644
index 0000000000..1b5e329e93
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/Clock.cpp
@@ -0,0 +1,68 @@
+#include <Clock.h>
+#include <Design.h>
+#include <Port.h>
+
+using namespace xsi;
+
+Clock::Clock(xsi::Design& des) : design(des) {
+    // Find Global Control & Run Startup Sequence
+    Port& clk = des.getPort("ap_clk");
+    auto ports = des.ports();
+
+    Port* clk2x = nullptr;
+    for (auto&& p : ports) {
+        if (p.name() == std::string("ap_clk2x")) {
+            clk2x = &p;
+            break;
+        }
+    }
+    clkHigh = clk2x ? std::function<void()>([&des, &clk, clk2x]() mutable {
+        des.run(1);
+        clk.set(1).write_back();
+        clk2x->set(1).write_back();
+        des.run(1);
+    }) : std::function<void()>([&des, &clk]() mutable {
+        des.run(1);
+        clk.set(1).write_back();
+        des.run(1);
+    });
+    clkLow = clk2x ? std::function<void()>([&des, &clk, clk2x]() mutable {
+        des.run(2499);
+        clk2x->set(0).write_back();
+        des.run(2500);
+        clk.set(0).write_back();
+        clk2x->set(1).write_back();
+        des.run(2500);
+        clk2x->set(0).write_back();
+        des.run(2499);
+
+    }) : std::function<void()>([&des, &clk]() mutable {
+        des.run(4999);
+        clk.set(0).write_back();
+        des.run(4999);
+    });
+    // cycle = clk2x ? std::function<void(bool)>([&des, &clk, clk2x](bool const up) mutable {
+    //     clk.set(up).write_back();
+    //     clk2x->set(1).write_back();
+    //     des.run(5000);
+    //     clk2x->set(0).write_back();
+    //     des.run(5000);
+    // })
+    //               : std::function<void(bool)>([&des, &clk](bool const up) mutable {
+    //                     clk.set(up).write_back();
+    //                     des.run(5000);
+    //                 });
+}
+
+void Clock::toggleClk() noexcept {
+    clkHigh();
+    clkLow();
+}
+
+void Clock::clockHigh() noexcept {
+    clkHigh();
+}
+
+void Clock::clockLow() noexcept {
+    clkLow();
+}
diff --git a/finn_xsi/finn_xsi/src/Design.cpp b/finn_xsi/finn_xsi/src/Design.cpp
new file mode 100644
index 0000000000..fcd85738eb
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/Design.cpp
@@ -0,0 +1,51 @@
+#include <Design.h>
+
+using namespace xsi;
+
+// Constructors
+Design::Design(xsi::Kernel& kernel, const std::string& design_lib, const s_xsi_setup_info& setup_info) : _kernel(std::move(kernel)) { _kernel.open(design_lib, setup_info); }
+
+Design::Design(xsi::Kernel& kernel, const std::string& design_lib, const char* const log_file, const char* const wdb_file)
+    : Design(kernel, design_lib, s_xsi_setup_info{.logFileName = const_cast<char*>(log_file), .wdbFileName = const_cast<char*>(wdb_file)}) {}
+
+// Destructor
+Design::~Design() { _kernel.close(); }
+
+// Move constructor
+Design::Design(Design&& other) noexcept : _kernel(std::move(other._kernel)) {
+    // The kernel now manages the moved design
+    // No additional work needed as the kernel handles the XSI state
+}
+
+// Move assignment operator
+Design& Design::operator=(Design&& other) noexcept {
+    if (this != &other) {
+        _kernel.close();  // Close current design
+                          // Note: _kernel is a reference and cannot be reassigned
+                          // The move semantics here are limited since we hold a reference
+        _kernel = std::move(other._kernel);
+    }
+    return *this;
+}
+
+// Simulation Control & Status
+void Design::trace_all() { _kernel.xsi<xsi::Kernel::Xsi::trace_all>(); }
+
+void Design::run(const XSI_INT64 step) { _kernel.xsi<xsi::Kernel::Xsi::run>(step); }
+
+void Design::restart() { _kernel.xsi<xsi::Kernel::Xsi::restart>(); }
+
+int Design::get_status() const noexcept { return _kernel.xsi<xsi::Kernel::Xsi::get_status>(); }
+
+const char* Design::get_error_info() const noexcept { return _kernel.xsi<xsi::Kernel::Xsi::get_error_info>(); }
+
+// Port Access
+int Design::num_ports() const noexcept { return static_cast<int>(_kernel.port_count()); }
+
+xsi::Port& Design::getPort(const std::string& name) { return _kernel.getPort(name.c_str()); }
+
+const xsi::Port& Design::getPort(const std::string& name) const { return _kernel.getPort(name.c_str()); }
+
+std::span<xsi::Port> Design::ports() noexcept { return _kernel.ports(); }
+
+std::span<const xsi::Port> Design::ports() const noexcept { return _kernel.ports(); }
diff --git a/finn_xsi/finn_xsi/src/FIFO.cpp b/finn_xsi/finn_xsi/src/FIFO.cpp
new file mode 100644
index 0000000000..9cfa53aacc
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/FIFO.cpp
@@ -0,0 +1,104 @@
+#include <FIFO.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+
+FIFO::FIFO(uint64_t size) : maxSize(size) {}
+FIFO::~FIFO() {}
+
+/// Prepare update for the next clock cycle.
+/// This models Q_srl behavior where:
+/// - When empty: only accepts input (ignores output ready), transitions to size 1
+/// - When non-empty: can consume, produce, or both
+/// With bounded maxSize, this models a real FIFO with backpressure.
+void FIFO::update(bool incomingValid, bool incomingReady) {
+    // When empty: only push if valid (ignoring ready)
+    // When non-empty: push if valid AND space available
+    uint64_t canPush = incomingValid & (currentUtil < maxSize);
+
+    // Q_srl behavior: when empty, only check input valid (ignore output ready)
+    // Only pop if was non-empty at start AND output ready
+    uint64_t canPop = incomingReady & (currentUtil != 0);
+
+    nextUtil = nextUtil + canPush - canPop;
+}
+
+/// Toggle the clock cycle, and update the previously set values.
+/// nextUtil is guaranteed to be in [0, maxSize] by all operations.
+/// Returns false if a first valid signal was expected, but has not been observed.
+bool FIFO::toggleClock() {
+    currentUtil = nextUtil;
+    maxUtil = std::max(maxUtil, currentUtil);
+    nextUtil = currentUtil;
+    cyclesUntilExpectedFirstValid -= static_cast<uint64_t>(static_cast<bool>(cyclesUntilExpectedFirstValid) & !static_cast<bool>(maxUtil));  // Underflow-safe decrement
+    return (cyclesUntilExpectedFirstValid == 0) & (maxUtil == 0);
+}
+
+/// Return whether the FIFO can accept inputs (for the current utilization)
+/// Uses nextUtil (post-push state) so that ready correctly reflects capacity
+/// after any push already committed this cycle, preventing AXI-S violations.
+bool FIFO::getInputReady([[maybe_unused]] std::stop_token stoken) noexcept { return nextUtil < maxSize; }
+
+/// Return whether the FIFO can output values (for the current utilization)
+bool FIFO::getOutputValid([[maybe_unused]] std::stop_token stoken) noexcept { return currentUtil > 0; }
+
+/// Return whether the FIFO is empty (for the current utilization)
+bool FIFO::isEmpty() const { return currentUtil == 0; }
+
+/// Reset the FIFOs internal state. If size is given, also set maxSize,
+/// otherwise keep it.
+void FIFO::reset(uint64_t size) {
+    currentUtil = 0;
+    maxUtil = 0;
+    maxSize = size;
+    nextUtil = 0;
+    cyclesUntilExpectedFirstValid = std::numeric_limits<uint64_t>::max();
+}
+
+void FIFO::setCyclesUntilExpectedFirstValid(uint64_t cycles) {
+    cyclesUntilExpectedFirstValid = cycles;
+    initialCyclesUntilExpectedFirstValid = cycles;
+    std::cout << "FIFO set to expect first valid after " << cycles << " cycles" << std::endl;
+}
+
+uint64_t FIFO::getCyclesUntilFirstValid() const { return initialCyclesUntilExpectedFirstValid - cyclesUntilExpectedFirstValid; }
+
+/// Set the FIFOs max size
+void FIFO::setMaxSize(const uint64_t size) { maxSize = size; }
+
+uint64_t FIFO::getMaxSize() const { return maxSize; }
+
+uint64_t FIFO::getSpaceLeft() const { return maxSize - currentUtil; }
+
+uint64_t FIFO::getMaxUtil() const { return maxUtil; }
+
+void FIFO::increaseCounter(const uint64_t count) {
+    // Branchless: compute new value and saturate at maxSize
+    uint64_t newUtil = nextUtil + count;
+    uint64_t overflow = newUtil > maxSize;
+    nextUtil = overflow ? maxSize : newUtil;
+}
+
+/// If incomingValid is true and FIFO has space, increment nextUtil
+/// Matches Q_srl: when empty, always accepts input
+/// When using tryPush/tryPop separately, ALWAYS call tryPush BEFORE tryPop!
+void FIFO::setInputValid(bool incomingValid, [[maybe_unused]] std::stop_token stoken) {
+    // When empty: accept input unconditionally (like Q_srl state_empty)
+    // When non-empty: accept if space available
+    nextUtil += incomingValid & (nextUtil < maxSize);
+}
+
+/// If incomingReady is true and FIFO has data, decrement nextUtil
+/// Matches Q_srl: only pops if data available
+/// When using tryPush/tryPop separately, ALWAYS call tryPush BEFORE tryPop!
+/// Note: If FIFO was empty and tryPush just added data, tryPop will NOT pop it
+/// (matching Q_srl where state_empty ignores output ready)
+void FIFO::setOutputReady(bool incomingReady, [[maybe_unused]] std::stop_token stoken) {
+    // Check currentUtil (state at cycle start) not nextUtil (after tryPush)
+    // This ensures empty->tryPush->tryPop results in size=1, matching Q_srl
+    nextUtil -= incomingReady & (currentUtil > 0);
+}
+
+/// Return the current number of elements in the FIFO
+uint64_t FIFO::size() const { return currentUtil; }
diff --git a/finn_xsi/finn_xsi/src/Kernel.cpp b/finn_xsi/finn_xsi/src/Kernel.cpp
new file mode 100644
index 0000000000..8b5b16657a
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/Kernel.cpp
@@ -0,0 +1,168 @@
+#include <Kernel.h>
+#include <Port.h>
+#include <SharedLibrary.h>
+
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+
+using namespace xsi;
+
+void* resolve_or_throw(xsi::SharedLibrary& lib, char const* const sym) {
+    auto const res = lib.getsymbol(sym);
+    if (!res) {
+        throw std::runtime_error(std::string("Failed to resolve ").append(sym).append(" in ").append(lib.path()));
+    }
+    return *res;
+}
+
+char const* const Kernel::Xsi::FUNC_NAMES[EXTENT] = {"xsi_get_value",      "xsi_put_value",
+                                                     "xsi_get_int_port",   "xsi_get_str_port",
+
+                                                     "xsi_get_int",        "xsi_get_port_number",
+
+                                                     "xsi_trace_all",      "xsi_run",
+                                                     "xsi_restart",        "xsi_get_status",
+                                                     "xsi_get_error_info",
+
+                                                     "xsi_close"};
+
+
+Kernel::Xsi::Xsi(xsi::SharedLibrary& lib) : _hdl(nullptr) {
+    // Resolve XSI Functions
+    for (unsigned i = 0; i < EXTENT; i++) {
+        _func[i] = resolve_or_throw(lib, FUNC_NAMES[i]);
+    }
+}
+
+// Xsi Move constructor
+Kernel::Xsi::Xsi(Xsi&& other) noexcept : _hdl(other._hdl) {
+    std::copy(std::begin(other._func), std::end(other._func), std::begin(_func));
+    other._hdl = nullptr;
+    std::fill(std::begin(other._func), std::end(other._func), nullptr);
+}
+
+// Xsi Move assignment operator
+Kernel::Xsi& Kernel::Xsi::operator=(Xsi&& other) noexcept {
+    if (this != &other) {
+        _hdl = other._hdl;
+        std::copy(std::begin(other._func), std::end(other._func), std::begin(_func));
+        other._hdl = nullptr;
+        std::fill(std::begin(other._func), std::end(other._func), nullptr);
+    }
+    return *this;
+}
+
+// Xsi Handle management
+void Kernel::Xsi::setHandle(xsiHandle hdl) noexcept { _hdl = hdl; }
+
+bool Kernel::Xsi::hasValidHandle() const noexcept { return _hdl != nullptr; }
+//---------------------------------------------------------------------------
+// Life Cycle
+
+// Move constructor
+Kernel::Kernel(Kernel&& other) noexcept : _kernel_lib(std::move(other._kernel_lib)), _xsi(std::move(other._xsi)), _design_lib(std::move(other._design_lib)), _ports() {
+    // Reset source
+    other._ports.clear();
+
+    // Recreate ports if design is open
+    if (_design_lib && _xsi.hasValidHandle()) {
+        // Enumerate Ports
+        unsigned const port_count = static_cast<unsigned>(xsi<Xsi::get_int>(xsiNumTopPorts));
+        _ports.reserve(port_count);
+        for (unsigned i = 0; i < port_count; ++i) {
+            _ports.emplace_back(Port(*this, i));
+        }
+    }
+}
+
+// Move assignment operator
+Kernel& Kernel::operator=(Kernel&& other) noexcept {
+    if (this != &other) {
+        // Clean up current state
+        close();
+
+        // Move from other
+        _kernel_lib = std::move(other._kernel_lib);
+        _xsi = std::move(other._xsi);
+        _design_lib = std::move(other._design_lib);
+
+        // Reset ports in source
+        other._ports.clear();
+
+        // Recreate ports if design is open
+        if (_design_lib && _xsi.hasValidHandle()) {
+            // Enumerate Ports
+            unsigned const port_count = static_cast<unsigned>(xsi<Xsi::get_int>(xsiNumTopPorts));
+            _ports.reserve(port_count);
+            for (unsigned i = 0; i < port_count; i++) {
+                _ports.emplace_back(Port(*this, i));
+            }
+        }
+    }
+    return *this;
+}
+
+Kernel::Kernel(const std::string& kernel_lib) : _kernel_lib(kernel_lib), _xsi(_kernel_lib) {}
+
+Kernel::~Kernel() {
+    if (_design_lib)
+        std::cerr << "Disposing XSI Kernel with open Design." << std::endl;
+}
+
+void Kernel::open(const std::string& design_lib, const s_xsi_setup_info& setup_info) {
+    _design_lib.open(design_lib);
+    try {
+        auto const f = t_fp_xsi_open(resolve_or_throw(_design_lib, "xsi_open"));
+        xsiHandle const hdl = f(const_cast<p_xsi_setup_info>(&setup_info));
+        if (!hdl)
+            throw std::runtime_error("Loading of design failed");
+        _xsi.setHandle(hdl);
+
+        // Enumerate Ports
+        unsigned const port_count = static_cast<unsigned>(xsi<Xsi::get_int>(xsiNumTopPorts));
+        _ports.reserve(port_count);
+        for (unsigned i = 0; i < port_count; i++) {
+            _ports.emplace_back(Port(*this, i));
+        }
+    } catch (...) {
+        std::cerr << "Exception during design open, closing design library." << std::endl;
+        _design_lib.close();
+        throw;
+    }
+}
+void Kernel::close() noexcept {
+    xsi<Xsi::close>();
+    _xsi.setHandle(nullptr);
+    _design_lib.close();
+
+    // Clear ports - unique_ptr will handle destruction automatically
+    _ports.clear();
+
+    // Clean up Library State
+    std::optional<void*> vptr = _kernel_lib.getsymbol("svTypeInfo");
+    if (vptr)
+        *vptr = nullptr;
+}
+
+Port& Kernel::getPort(const char* const name) {
+    int const id = xsi<Xsi::get_port_number>(name);
+
+    if (id == -1 || id >= static_cast<int>(_ports.size())) {
+        throw std::runtime_error(std::string("Port not found: ").append(name));
+    }
+    return _ports[static_cast<std::size_t>(id)];
+}
+const Port& Kernel::getPort(const char* const name) const {
+    int const id = xsi<Xsi::get_port_number>(name);
+
+    if (id == -1 || id >= static_cast<int>(_ports.size())) {
+        throw std::runtime_error(std::string("Port not found: ").append(name));
+    }
+    return _ports[static_cast<std::size_t>(id)];
+}
+std::span<Port> Kernel::ports() noexcept { return std::span<Port>(_ports.data(), _ports.data() + _ports.size()); }
+std::span<const Port> Kernel::ports() const noexcept { return std::span<const Port>(_ports.data(), _ports.data() + _ports.size()); }
+
+// Port count accessor for Design class
+size_t Kernel::port_count() const noexcept { return _ports.size(); }
diff --git a/finn_xsi/finn_xsi/src/Port.cpp b/finn_xsi/finn_xsi/src/Port.cpp
new file mode 100644
index 0000000000..436c2f4778
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/Port.cpp
@@ -0,0 +1,208 @@
+#include <Kernel.h>
+#include <Port.h>
+#include <helper.h>
+
+using namespace xsi;
+
+Port::Port(Kernel& kernel, const unsigned id) : _kernel(kernel), _id(id), buffer((width() + 31) / 32) {}
+
+Port::Port(Port&& other) noexcept : _kernel(other._kernel), _id(other._id), buffer(std::move(other.buffer)) {
+    // Note: _kernel and _id are reference and const respectively, so they're initialized from other
+    // The buffer is moved from the other object
+}
+
+Port::~Port() noexcept {}
+
+bool Port::hasUnknown() const noexcept {
+    for (auto&& elem : buffer) {
+        if (elem.bVal)
+            return true;
+    }
+    return false;
+}
+
+bool Port::isZero() const noexcept {
+    for (auto&& elem : buffer) {
+        if (elem.aVal)
+            return false;
+    }
+    return true;
+}
+
+std::string Port::as_binstr() const {
+    unsigned const w = width();
+    std::string res(w, '?');
+
+    auto buffer_iter = buffer.cbegin();
+    auto res_iter = res.rbegin();  // Use reverse iterator to fill from right to left
+
+    uint32_t a = 0;
+    uint32_t b = 0;
+    for (unsigned i = 0; i < w; i++) {
+        if ((i & 31) == 0) {
+            a = buffer_iter->aVal;
+            b = buffer_iter->bVal;
+            ++buffer_iter;
+        }
+        *res_iter++ = XZ10[((b & 1) << 1) | (a & 1)];
+        a >>= 1;
+        b >>= 1;
+    }
+
+    return res;
+}
+
+std::string Port::as_hexstr() const {
+    unsigned l = (width() + 3) / 4;
+    std::string res(l, '?');
+    auto buffer_iter = buffer.cbegin();
+    auto res_iter = res.rbegin();  // Use reverse iterator to fill from right to left
+
+    while (l > 0) {
+        uint32_t a = buffer_iter->aVal;
+        uint32_t b = buffer_iter->bVal;
+        ++buffer_iter;
+
+        unsigned m = std::min(8u, l);
+        l -= m;
+        for (unsigned i = 0; i < m; ++i) {
+            unsigned const bm = b & 0xF;
+            unsigned const am = a & 0xF;
+
+            *res_iter++ = !bm ? HEX[am] : XZ10[3 - !(am & bm)];
+            a >>= 4;
+            b >>= 4;
+        }
+    }
+    return res;
+}
+
+Port& Port::clear() {
+    std::fill(buffer.begin(), buffer.end(), s_xsi_vlog_logicval{.aVal = 0u, .bVal = 0u});
+    return *this;
+}
+
+const char* Port::name() const noexcept { return _kernel.xsi<Kernel::Xsi::get_str_port>(static_cast<int>(_id), xsiNameTopPort); }
+
+int Port::dir() const noexcept { return _kernel.xsi<Kernel::Xsi::get_int_port>(static_cast<int>(_id), xsiDirectionTopPort); }
+
+unsigned Port::width() const noexcept { return static_cast<unsigned>(_kernel.xsi<Kernel::Xsi::get_int_port>(static_cast<int>(_id), xsiHDLValueSize)); }
+
+bool Port::isInput() const noexcept { return dir() == xsiInputPort; }
+
+bool Port::isOutput() const noexcept { return dir() == xsiOutputPort; }
+
+bool Port::isInout() const noexcept { return dir() == xsiInoutPort; }
+
+Port& Port::read() {
+    _kernel.xsi<Kernel::Xsi::get_value>(static_cast<int>(_id), buffer.data());
+    return *this;
+}
+
+void Port::write_back() { _kernel.xsi<Kernel::Xsi::put_value>(static_cast<int>(_id), buffer.data()); }
+
+bool Port::operator[](const unsigned idx) const noexcept { return (buffer[idx / 32].aVal >> (idx % 32)) & 1; }
+
+bool Port::as_bool() const noexcept { return buffer[0].aVal & 1; }
+
+unsigned Port::as_unsigned() const noexcept { return buffer[0].aVal; }
+
+Port& Port::set(const unsigned val) {
+    s_xsi_vlog_logicval* const p = buffer.data();
+    p->aVal = val;
+    p->bVal = 0;
+    return *this;
+}
+
+Port& Port::set_binstr(const std::string& val) {
+    auto val_iter = val.crbegin();  // Process from right to left
+
+    size_t chars_processed = 0;
+    const size_t val_length = val.length();
+
+    for (auto& elem : buffer) {
+        uint32_t a = 0;
+        uint32_t b = 0;
+
+        // Process up to 32 characters for this buffer element
+        const size_t chars_to_process = std::min(32UL, val_length - chars_processed);
+
+        for (size_t j = 0; j < chars_to_process; ++j) {
+            a <<= 1;
+            b <<= 1;
+
+            if (val_iter != val.crend()) {
+                switch (*val_iter++) {
+                    case '1':
+                        a |= 1;
+                        [[fallthrough]];
+                    case '0':
+                        break;
+                    default:
+                        a |= 1;
+                        [[fallthrough]];
+                    case 'Z':
+                    case 'z':
+                        b |= 1;
+                        break;
+                }
+            }
+        }
+
+        elem.aVal = a;
+        elem.bVal = b;
+
+        chars_processed += chars_to_process;
+        if (chars_processed >= val_length)
+            break;
+    }
+
+    return *this;
+}
+
+Port& Port::set_hexstr(const std::string& val) {
+    auto val_iter = val.crbegin();  // Process from right to left
+
+    size_t chars_processed = 0;
+    const size_t val_length = val.length();
+
+    for (auto& elem : buffer) {
+        uint32_t a = 0;
+        uint32_t b = 0;
+
+        // Process up to 8 hex characters (32 bits) for this buffer element
+        const size_t chars_to_process = std::min(8UL, val_length - chars_processed);
+
+        for (size_t j = 0; j < chars_to_process; ++j) {
+            a <<= 4;
+            b <<= 4;
+
+            if (val_iter != val.crend()) {
+                char c = *val_iter++;
+
+                if (('0' <= c) && c <= '9') {
+                    a |= c & 0xF;
+                } else {
+                    c |= 0x20;  // Convert to lowercase
+                    if (('a' <= c) && (c <= 'f')) {
+                        a |= static_cast<uint32_t>(c - ('a' - 10));
+                    } else {
+                        b |= 0xF;
+                        if (c != 'z') {
+                            a |= 0xF;
+                        }
+                    }
+                }
+            }
+        }
+
+        elem.aVal = a;
+        elem.bVal = b;
+
+        chars_processed += chars_to_process;
+        if (chars_processed >= val_length)
+            break;
+    }
+
+    return *this;
+}
diff --git a/finn_xsi/finn_xsi/src/SharedLibrary.cpp b/finn_xsi/finn_xsi/src/SharedLibrary.cpp
new file mode 100644
index 0000000000..81ce1ff33e
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/SharedLibrary.cpp
@@ -0,0 +1,120 @@
+#include <SharedLibrary.h>
+
+#include <stdexcept>
+
+using namespace xsi;
+
+char const SharedLibrary::library_suffix[] =
+#if defined(_WIN32)
+    ".lib";
+#else
+    ".so";
+#endif
+
+#if defined(_WIN32)
+namespace {
+    std::string translate_error_message(DWORD errid) {
+        std::string msg;
+        LPTSTR bufptr;
+        FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, errid, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), &bufptr, 0, nullptr);
+        if (bufptr)
+            msg = reinterpret_cast<char*>(bufptr);
+        LocalFree(bufptr);
+        return msg;
+    }
+}  // namespace
+#endif
+
+SharedLibrary& SharedLibrary::open(const std::string& path) {
+    if (_lib)
+        throw std::runtime_error("SharedLibrary still open for " + _path);
+    _lib = load(path);
+    _path = path;
+    return *this;
+}
+
+SharedLibrary::handle_type SharedLibrary::load(const std::string& path) {
+    if (path.empty())
+        throw std::domain_error("Empty library path.");
+
+#if defined(_WIN32)
+    SetLastError(0);
+    #ifdef UNICODE
+    // Use LoadLibraryA explicitly on windows if UNICODE is defined
+    handle_type const lib = LoadLibraryA(path.c_str());
+    #else
+    handle_type const lib = LoadLibrary(path.c_str());
+    #endif
+    if (!lib)
+        throw std::runtime_error(translate_error_message(GetLastError()));
+#else
+    handle_type const lib = dlopen(path.c_str(), RTLD_LAZY | RTLD_GLOBAL);
+    if (!lib)
+        throw std::runtime_error(dlerror());
+#endif
+    return lib;
+}
+
+void SharedLibrary::unload() noexcept {
+    if (_lib) {
+#if defined(_WIN32)
+        FreeLibrary(_lib);
+#else
+        dlclose(_lib);
+#endif
+    }
+}
+
+std::optional<void*> SharedLibrary::getsymbol(const char* const name) {
+    void* sym;
+#if defined(_WIN32)
+    sym = (void*) GetProcAddress(_lib, name);
+    if (!sym)
+#else
+    dlerror();  // clear error
+    sym = dlsym(_lib, name);
+    char const* const err = dlerror();
+    if (err)
+#endif
+        return std::nullopt;
+    return std::make_optional(sym);
+}
+
+// Constructors
+SharedLibrary::SharedLibrary() : _lib(nullptr), _path() {}
+
+SharedLibrary::SharedLibrary(const std::string& path) : _lib(load(path)), _path(path) {}
+
+// Destructor
+SharedLibrary::~SharedLibrary() { unload(); }
+
+// Move constructor
+SharedLibrary::SharedLibrary(SharedLibrary&& other) noexcept : _lib(other._lib), _path(std::move(other._path)) { other._lib = nullptr; }
+
+// Move assignment operator
+SharedLibrary& SharedLibrary::operator=(SharedLibrary&& other) noexcept {
+    if (this != &other) {
+        // Clean up current state
+        unload();
+
+        // Move from other
+        _lib = other._lib;
+        _path = std::move(other._path);
+
+        // Reset other
+        other._lib = nullptr;
+    }
+    return *this;
+}
+
+// Member functions
+SharedLibrary::operator bool() const noexcept { return bool(_lib); }
+
+SharedLibrary& SharedLibrary::close() noexcept {
+    unload();
+    _lib = nullptr;
+    _path.clear();
+    return *this;
+}
+
+const std::string& SharedLibrary::path() const noexcept { return _path; }
diff --git a/finn_xsi/finn_xsi/src/SocketServer.cpp b/finn_xsi/finn_xsi/src/SocketServer.cpp
new file mode 100644
index 0000000000..e5fafe3997
--- /dev/null
+++ b/finn_xsi/finn_xsi/src/SocketServer.cpp
@@ -0,0 +1,143 @@
+#include <SocketServer.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <cstring>
+#include <iostream>
+#include <utility>
+
+SocketServer::SocketServer(std::string_view path) : socket_path(path) {}
+
+SocketServer::~SocketServer() { close_connection(); }
+
+SocketServer::SocketServer(SocketServer&& other) noexcept
+    : server_fd(std::exchange(other.server_fd, -1)), client_fd(std::exchange(other.client_fd, -1)), socket_path(std::move(other.socket_path)) {}
+
+SocketServer& SocketServer::operator=(SocketServer&& other) noexcept {
+    if (this != &other) {
+        close_connection();
+        server_fd = std::exchange(other.server_fd, -1);
+        client_fd = std::exchange(other.client_fd, -1);
+        socket_path = std::move(other.socket_path);
+    }
+    return *this;
+}
+
+void SocketServer::close_fd(int& fd) noexcept {
+    if (fd >= 0) {
+        ::close(fd);
+        fd = -1;
+    }
+}
+
+std::optional<std::string> SocketServer::initialize() {
+    // Create socket
+    server_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (server_fd < 0) {
+        return "Failed to create socket: " + std::string(strerror(errno));
+    }
+
+    // Remove existing socket file
+    unlink(socket_path.c_str());
+
+    // Bind socket
+    sockaddr_un addr{};
+    addr.sun_family = AF_UNIX;
+    strncpy(addr.sun_path, socket_path.c_str(), sizeof(addr.sun_path) - 1);
+
+    if (bind(server_fd, reinterpret_cast<sockaddr*>(&addr), sizeof(addr)) < 0) {
+        std::string error = "Failed to bind socket: " + std::string(strerror(errno));
+        close_fd(server_fd);
+        return error;
+    }
+
+    // Listen
+    if (listen(server_fd, 1) < 0) {
+        std::string error = "Failed to listen on socket: " + std::string(strerror(errno));
+        close_fd(server_fd);
+        return error;
+    }
+
+    // Accept connection
+    client_fd = accept(server_fd, nullptr, nullptr);
+    if (client_fd < 0) {
+        std::string error = "Failed to accept connection: " + std::string(strerror(errno));
+        close_fd(server_fd);
+        return error;
+    }
+
+    return std::nullopt;  // Success
+}
+
+std::optional<json> SocketServer::receive_message() {
+    if (client_fd < 0) {
+        std::cerr << "Socket not connected" << std::endl;
+        return std::nullopt;
+    }
+
+    // Read length prefix
+    uint32_t length{};
+    const ssize_t bytes_read = read(client_fd, &length, sizeof(length));
+    if (bytes_read != sizeof(length)) {
+        if (bytes_read == 0) {
+            std::cerr << "Connection closed by client" << std::endl;
+        } else {
+            std::cerr << "Failed to read message length: " << strerror(errno) << std::endl;
+        }
+        return std::nullopt;
+    }
+
+    // Read message
+    std::string buffer(length, '\0');
+    size_t total_read = 0;
+    while (total_read < length) {
+        const ssize_t n = read(client_fd, buffer.data() + total_read, length - total_read);
+        if (n <= 0) {
+            std::cerr << "Failed to read message data: " << strerror(errno) << std::endl;
+            return std::nullopt;
+        }
+        total_read += static_cast<size_t>(n);
+    }
+
+    try {
+        return json::parse(buffer);
+    } catch (const json::exception& e) {
+        std::cerr << "Failed to parse JSON: " << e.what() << std::endl;
+        return std::nullopt;
+    }
+}
+
+void SocketServer::send_message(const json& message) {
+    if (client_fd < 0) {
+        std::cerr << "Socket not connected" << std::endl;
+        return;
+    }
+
+    const std::string msg_str = message.dump();
+    const uint32_t length = static_cast<uint32_t>(msg_str.size());
+
+    // Send length prefix
+    const ssize_t bytes_written = write(client_fd, &length, sizeof(length));
+    if (bytes_written != sizeof(length)) {
+        std::cerr << "Failed to write message length: " << strerror(errno) << std::endl;
+        return;
+    }
+
+    // Send message
+    size_t total_written = 0;
+    while (total_written < length) {
+        const ssize_t n = write(client_fd, msg_str.data() + total_written, length - total_written);
+        if (n <= 0) {
+            std::cerr << "Failed to write message data: " << strerror(errno) << std::endl;
+            return;
+        }
+        total_written += static_cast<size_t>(n);
+    }
+}
+
+void SocketServer::close_connection() noexcept {
+    close_fd(client_fd);
+    close_fd(server_fd);
+    unlink(socket_path.c_str());
+}
diff --git a/finn_xsi/finn_xsi/unittests/CMakeLists.txt b/finn_xsi/finn_xsi/unittests/CMakeLists.txt
new file mode 100644
index 0000000000..20a699ae33
--- /dev/null
+++ b/finn_xsi/finn_xsi/unittests/CMakeLists.txt
@@ -0,0 +1,40 @@
+# Enable testing
+enable_testing()
+
+# Fetch Google Test
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  GIT_REPOSITORY https://github.com/google/googletest.git
+  GIT_TAG        v1.14.0
+)
+# For Windows: Prevent overriding the parent project's compiler/linker settings
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
+# Add FIFO unit tests
+add_executable(FIFO_test FIFO_test.cpp ${CORE_SRC})
+target_link_libraries(FIFO_test PRIVATE nlohmann_json::nlohmann_json GTest::gtest_main Threads::Threads -ldl -lrt)
+target_include_directories(FIFO_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+target_include_directories(FIFO_test PUBLIC "$ENV{XILINX_VIVADO}/data/xsim/include")
+
+# Add InterprocessCommunicationChannel unit tests
+add_executable(InterprocessCommunicationChannel_test InterprocessCommunicationChannel_test.cpp)
+target_link_libraries(InterprocessCommunicationChannel_test PRIVATE GTest::gtest_main Threads::Threads -ldl -lrt nlohmann_json::nlohmann_json)
+target_include_directories(InterprocessCommunicationChannel_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include ${Boost_INCLUDE_DIRS})
+
+# Add Integration tests (FIFO + InterSimulationInterface)
+add_executable(Integration_test Integration_test.cpp ${CORE_SRC})
+target_link_libraries(Integration_test PRIVATE nlohmann_json::nlohmann_json GTest::gtest_main Threads::Threads -ldl -lrt)
+target_include_directories(Integration_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include ${Boost_INCLUDE_DIRS})
+target_include_directories(Integration_test PUBLIC "$ENV{XILINX_VIVADO}/data/xsim/include")
+
+# Register tests with CTest
+include(GoogleTest)
+gtest_discover_tests(FIFO_test)
+gtest_discover_tests(InterprocessCommunicationChannel_test)
+gtest_discover_tests(Integration_test)
+
+# Create a target to build all unittests at once
+add_custom_target(all_unittests)
+add_dependencies(all_unittests FIFO_test InterprocessCommunicationChannel_test Integration_test)
diff --git a/finn_xsi/finn_xsi/unittests/FIFO_test.cpp b/finn_xsi/finn_xsi/unittests/FIFO_test.cpp
new file mode 100644
index 0000000000..835ab0dd63
--- /dev/null
+++ b/finn_xsi/finn_xsi/unittests/FIFO_test.cpp
@@ -0,0 +1,854 @@
+#include "FIFO.h"
+
+#include <gtest/gtest.h>
+
+// Test fixture for FIFO tests
+class FIFOTest : public ::testing::Test {
+     protected:
+    void SetUp() override {
+        // Setup code if needed
+    }
+
+    void TearDown() override {
+        // Cleanup code if needed
+    }
+};
+
+// ===== Constructor and Initialization Tests =====
+
+TEST_F(FIFOTest, ConstructorWithDefaultSize) {
+    FIFO fifo;
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_TRUE(fifo.getInputReady());
+    EXPECT_FALSE(fifo.getOutputValid());
+}
+
+TEST_F(FIFOTest, ConstructorWithSpecificSize) {
+    FIFO fifo(10);
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_TRUE(fifo.getInputReady());
+    EXPECT_FALSE(fifo.getOutputValid());
+    EXPECT_EQ(fifo.getSpaceLeft(), 10);
+}
+
+TEST_F(FIFOTest, ConstructorWithZeroSize) {
+    FIFO fifo(0);
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_FALSE(fifo.getInputReady());
+    EXPECT_FALSE(fifo.getOutputValid());
+    EXPECT_EQ(fifo.getSpaceLeft(), 0);
+}
+
+// ===== Reset Tests =====
+
+TEST_F(FIFOTest, ResetClearsState) {
+    FIFO fifo(10);
+    fifo.update(true, false);  // Add one element
+    fifo.toggleClock();
+    EXPECT_FALSE(fifo.isEmpty());
+
+    fifo.reset(10);
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_EQ(fifo.getSpaceLeft(), 10);
+}
+
+TEST_F(FIFOTest, ResetChangesSize) {
+    FIFO fifo(10);
+    fifo.reset(20);
+    EXPECT_EQ(fifo.getSpaceLeft(), 20);
+}
+
+TEST_F(FIFOTest, SetMaxSize) {
+    FIFO fifo(10);
+    fifo.setMaxSize(15);
+    EXPECT_EQ(fifo.getSpaceLeft(), 15);
+}
+
+// ===== Basic Update and Toggle Tests =====
+
+TEST_F(FIFOTest, PushOneElement) {
+    FIFO fifo(10);
+    fifo.update(true, false);  // Push (valid=true, ready=false)
+    fifo.toggleClock();
+
+    EXPECT_FALSE(fifo.isEmpty());
+    EXPECT_TRUE(fifo.getOutputValid());
+    EXPECT_TRUE(fifo.getInputReady());
+    EXPECT_EQ(fifo.getSpaceLeft(), 9);
+}
+
+TEST_F(FIFOTest, PopOneElement) {
+    FIFO fifo(10);
+    // First push an element
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    // Then pop it
+    fifo.update(false, true);  // Pop (valid=false, ready=true)
+    fifo.toggleClock();
+
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_FALSE(fifo.getOutputValid());
+    EXPECT_EQ(fifo.getSpaceLeft(), 10);
+}
+
+TEST_F(FIFOTest, PushAndPopSimultaneously) {
+    FIFO fifo(10);
+    // First push an element
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    // Now push and pop simultaneously (FIFO size should stay the same)
+    fifo.update(true, true);
+    fifo.toggleClock();
+
+    EXPECT_FALSE(fifo.isEmpty());
+    EXPECT_TRUE(fifo.getOutputValid());
+    EXPECT_EQ(fifo.getSpaceLeft(), 9);
+}
+
+// ===== Boundary Condition Tests =====
+
+TEST_F(FIFOTest, FillToCapacity) {
+    FIFO fifo(3);
+
+    for (int i = 0; i < 3; ++i) {
+        fifo.update(true, false);
+        fifo.toggleClock();
+    }
+
+    EXPECT_FALSE(fifo.isEmpty());
+    EXPECT_TRUE(fifo.getOutputValid());
+    EXPECT_FALSE(fifo.getInputReady());
+    EXPECT_EQ(fifo.getSpaceLeft(), 0);
+}
+
+TEST_F(FIFOTest, CannotPushWhenFull) {
+    FIFO fifo(2);
+
+    // Fill the FIFO
+    fifo.update(true, false);
+    fifo.toggleClock();
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    EXPECT_FALSE(fifo.getInputReady());
+
+    // Try to push when full (should have no effect)
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    EXPECT_EQ(fifo.getSpaceLeft(), 0);
+}
+
+TEST_F(FIFOTest, CanPushAndPullWhenFull) {
+    FIFO fifo(2);
+
+    // Fill the FIFO
+    fifo.update(true, false);
+    fifo.toggleClock();
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    EXPECT_FALSE(fifo.getInputReady());
+
+    // Try to push and pull when full (should have no effect)
+    fifo.update(true, true);
+    fifo.toggleClock();
+
+    EXPECT_EQ(fifo.getSpaceLeft(), 1);
+
+    fifo.reset(2);
+
+    // Fill the FIFO
+    fifo.update(true, false);
+    fifo.toggleClock();
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    EXPECT_FALSE(fifo.getInputReady());
+
+    // Try to push and pull when full (should have no effect)
+    fifo.update(false, true);
+    fifo.toggleClock();
+
+    EXPECT_EQ(fifo.getSpaceLeft(), 1);
+}
+
+TEST_F(FIFOTest, CannotPopWhenEmpty) {
+    FIFO fifo(10);
+
+    EXPECT_TRUE(fifo.isEmpty());
+
+    // Try to pop when empty (should have no effect)
+    fifo.update(false, true);
+    fifo.toggleClock();
+
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_EQ(fifo.getSpaceLeft(), 10);
+}
+
+TEST_F(FIFOTest, CanPushAndPopWhenEmpty) {
+    FIFO fifo(10);
+
+    EXPECT_TRUE(fifo.isEmpty());
+
+    // Try to pop when empty (should have no effect)
+    fifo.update(true, true);
+    fifo.toggleClock();
+
+    EXPECT_FALSE(fifo.isEmpty());
+    EXPECT_TRUE(fifo.getOutputValid());
+    EXPECT_EQ(fifo.getSpaceLeft(), 9);
+}
+
+TEST_F(FIFOTest, PopWhenFullMakesSpaceAvailable) {
+    FIFO fifo(2);
+
+    // Fill the FIFO
+    fifo.update(true, false);
+    fifo.toggleClock();
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    EXPECT_FALSE(fifo.getInputReady());
+
+    // Pop one element
+    fifo.update(false, true);
+    fifo.toggleClock();
+
+    EXPECT_TRUE(fifo.getInputReady());
+    EXPECT_EQ(fifo.getSpaceLeft(), 1);
+}
+
+// ===== Sequential Operation Tests =====
+
+TEST_F(FIFOTest, SequentialPushAndPop) {
+    FIFO fifo(5);
+
+    // Push 3 elements
+    for (int i = 0; i < 3; ++i) {
+        fifo.update(true, false);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.getSpaceLeft(), 2);
+
+    // Pop 2 elements
+    for (int i = 0; i < 2; ++i) {
+        fifo.update(false, true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.getSpaceLeft(), 4);
+
+    // Pop 1 more
+    fifo.update(false, true);
+    fifo.toggleClock();
+    EXPECT_TRUE(fifo.isEmpty());
+}
+
+TEST_F(FIFOTest, AlternatingPushPop) {
+    FIFO fifo(10);
+
+    for (int i = 0; i < 5; ++i) {
+        // Push
+        fifo.update(true, false);
+        fifo.toggleClock();
+        EXPECT_FALSE(fifo.isEmpty());
+
+        // Pop
+        fifo.update(false, true);
+        fifo.toggleClock();
+        EXPECT_TRUE(fifo.isEmpty());
+    }
+}
+
+TEST_F(FIFOTest, StreamingOperation) {
+    FIFO fifo(10);
+
+    // Push one element first
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    // Now stream: push and pop simultaneously for multiple cycles
+    for (int i = 0; i < 100; ++i) {
+        fifo.update(true, true);
+        fifo.toggleClock();
+        EXPECT_EQ(fifo.getSpaceLeft(), 9);  // Size should remain constant
+    }
+}
+
+// ===== State Query Tests =====
+
+TEST_F(FIFOTest, IsEmptyCorrectly) {
+    FIFO fifo(5);
+    EXPECT_TRUE(fifo.isEmpty());
+
+    fifo.update(true, false);
+    fifo.toggleClock();
+    EXPECT_FALSE(fifo.isEmpty());
+
+    fifo.update(false, true);
+    fifo.toggleClock();
+    EXPECT_TRUE(fifo.isEmpty());
+}
+
+TEST_F(FIFOTest, IsInputReadyCorrectly) {
+    FIFO fifo(2);
+    EXPECT_TRUE(fifo.getInputReady());
+
+    fifo.update(true, false);
+    fifo.toggleClock();
+    EXPECT_TRUE(fifo.getInputReady());
+
+    fifo.update(true, false);
+    fifo.toggleClock();
+    EXPECT_FALSE(fifo.getInputReady());
+}
+
+TEST_F(FIFOTest, IsOutputValidCorrectly) {
+    FIFO fifo(5);
+    EXPECT_FALSE(fifo.getOutputValid());
+
+    fifo.update(true, false);
+    fifo.toggleClock();
+    EXPECT_TRUE(fifo.getOutputValid());
+
+    fifo.update(false, true);
+    fifo.toggleClock();
+    EXPECT_FALSE(fifo.getOutputValid());
+}
+
+TEST_F(FIFOTest, GetSpaceLeftCorrectly) {
+    FIFO fifo(10);
+    EXPECT_EQ(fifo.getSpaceLeft(), 10);
+
+    for (int i = 0; i < 3; ++i) {
+        fifo.update(true, false);
+        fifo.toggleClock();
+        EXPECT_EQ(fifo.getSpaceLeft(), 10 - i - 1);
+    }
+
+    fifo.update(false, true);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.getSpaceLeft(), 8);
+}
+
+// ===== Edge Case Tests =====
+
+TEST_F(FIFOTest, NoUpdateBeforeToggle) {
+    FIFO fifo(10);
+    fifo.toggleClock();  // Toggle without update
+
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_EQ(fifo.getSpaceLeft(), 10);
+}
+
+TEST_F(FIFOTest, LargeCapacity) {
+    FIFO fifo(1000000);
+    EXPECT_EQ(fifo.getSpaceLeft(), 1000000);
+
+    for (int i = 0; i < 100; ++i) {
+        fifo.update(true, false);
+        fifo.toggleClock();
+    }
+
+    EXPECT_EQ(fifo.getSpaceLeft(), 999900);
+}
+
+// ===== IncreaseCounter Tests =====
+
+TEST_F(FIFOTest, IncreaseCounterBasic) {
+    FIFO fifo(100);
+
+    fifo.update(true, false);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.getSpaceLeft(), 99);
+
+    fifo.increaseCounter(5);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.getSpaceLeft(), 94);
+}
+
+TEST_F(FIFOTest, IncreaseCounterOnEmptyFIFO) {
+    FIFO fifo(100);
+
+    fifo.increaseCounter(10);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.getSpaceLeft(), 90);
+}
+
+TEST_F(FIFOTest, IncreaseCounterZero) {
+    FIFO fifo(100);
+
+    fifo.update(true, false);
+    fifo.toggleClock();
+
+    fifo.increaseCounter(0);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.getSpaceLeft(), 99);
+}
+
+// ===== Complex Scenarios =====
+
+TEST_F(FIFOTest, BurstTrafficPattern) {
+    FIFO fifo(20);
+
+    // Burst of 10 pushes
+    for (int i = 0; i < 10; ++i) {
+        fifo.update(true, false);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.getSpaceLeft(), 10);
+
+    // Burst of 10 pops
+    for (int i = 0; i < 10; ++i) {
+        fifo.update(false, true);
+        fifo.toggleClock();
+    }
+    EXPECT_TRUE(fifo.isEmpty());
+}
+
+TEST_F(FIFOTest, StressTestManyOperations) {
+    FIFO fifo(100);
+
+    // Perform 1000 operations
+    for (int i = 0; i < 500; ++i) {
+        fifo.update(true, false);
+        fifo.toggleClock();
+    }
+
+    for (int i = 0; i < 500; ++i) {
+        fifo.update(false, true);
+        fifo.toggleClock();
+    }
+
+    EXPECT_TRUE(fifo.isEmpty());
+}
+
+// ===== Multiple FIFO Instances =====
+
+TEST_F(FIFOTest, MultipleFIFOsIndependent) {
+    FIFO fifo1(10);
+    FIFO fifo2(20);
+
+    fifo1.update(true, false);
+    fifo1.toggleClock();
+
+    EXPECT_EQ(fifo1.getSpaceLeft(), 9);
+    EXPECT_EQ(fifo2.getSpaceLeft(), 20);
+
+    fifo2.update(true, false);
+    fifo2.update(true, false);
+    fifo2.toggleClock();
+    fifo2.toggleClock();
+
+    // fifo2 should have 2 elements (last update takes effect)
+    EXPECT_EQ(fifo1.getSpaceLeft(), 9);
+    EXPECT_TRUE(fifo2.getSpaceLeft() < 20);
+}
+
+// ===== Individual Method Tests =====
+
+TEST_F(FIFOTest, TryPushBasic) {
+    FIFO fifo(10);
+    EXPECT_EQ(fifo.size(), 0);
+
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+
+    EXPECT_EQ(fifo.size(), 1);
+    EXPECT_FALSE(fifo.isEmpty());
+    EXPECT_TRUE(fifo.getOutputValid());
+}
+
+TEST_F(FIFOTest, TryPushFalseDoesNothing) {
+    FIFO fifo(10);
+
+    fifo.setInputValid(false);
+    fifo.toggleClock();
+
+    EXPECT_EQ(fifo.size(), 0);
+    EXPECT_TRUE(fifo.isEmpty());
+}
+
+TEST_F(FIFOTest, TryPushMultiple) {
+    FIFO fifo(10);
+
+    for (int i = 0; i < 5; ++i) {
+        fifo.setInputValid(true);
+        fifo.toggleClock();
+    }
+
+    EXPECT_EQ(fifo.size(), 5);
+    EXPECT_EQ(fifo.getSpaceLeft(), 5);
+}
+
+TEST_F(FIFOTest, TryPushWhenFull) {
+    FIFO fifo(3);
+
+    // Fill the FIFO
+    for (int i = 0; i < 3; ++i) {
+        fifo.setInputValid(true);
+        fifo.toggleClock();
+    }
+
+    EXPECT_EQ(fifo.size(), 3);
+    EXPECT_FALSE(fifo.getInputReady());
+
+    // Try to push when full (should have no effect)
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+
+    EXPECT_EQ(fifo.size(), 3);
+}
+
+TEST_F(FIFOTest, TryPopBasic) {
+    FIFO fifo(10);
+
+    // First push an element
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.size(), 1);
+
+    // Then pop it
+    fifo.setOutputReady(true);
+    fifo.toggleClock();
+
+    EXPECT_EQ(fifo.size(), 0);
+    EXPECT_TRUE(fifo.isEmpty());
+}
+
+TEST_F(FIFOTest, TryPopFalseDoesNothing) {
+    FIFO fifo(10);
+
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+
+    fifo.setOutputReady(false);
+    fifo.toggleClock();
+
+    EXPECT_EQ(fifo.size(), 1);
+}
+
+TEST_F(FIFOTest, TryPopWhenEmpty) {
+    FIFO fifo(10);
+
+    EXPECT_TRUE(fifo.isEmpty());
+
+    // Try to pop when empty (should have no effect)
+    fifo.setOutputReady(true);
+    fifo.toggleClock();
+
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_EQ(fifo.size(), 0);
+}
+
+TEST_F(FIFOTest, TryPushAndTryPopSameCycle) {
+    FIFO fifo(10);
+
+    // Push first element
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.size(), 1);
+
+    // Push and pop in same cycle (order: push then pop)
+    fifo.setInputValid(true);
+    fifo.setOutputReady(true);
+    fifo.toggleClock();
+
+    // Should still have 1 element (pushed 1, popped 1)
+    EXPECT_EQ(fifo.size(), 1);
+
+    // Push and pop in same cycle (order: push then pop)
+    fifo.setOutputReady(true);
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+
+    // Should still have 1 element (pushed 1, popped 1)
+    EXPECT_EQ(fifo.size(), 1);
+}
+
+TEST_F(FIFOTest, TryPushAndTryPopSameCycleEmptyFIFO) {
+    FIFO fifo(10);
+
+    // Push and pop in same cycle (order: push then pop)
+    fifo.setInputValid(true);
+    fifo.setOutputReady(true);
+    fifo.toggleClock();
+
+    // Should still have 1 element (pushed 1, popped 0, because was empty)
+    EXPECT_EQ(fifo.size(), 1);
+
+    fifo.reset(10);
+    // Push and pop in same cycle (order: push then pop)
+    fifo.setInputValid(true);
+    fifo.setOutputReady(false);
+    fifo.toggleClock();
+
+    // Should still have 0 element (pushed 1, popped 0)
+    EXPECT_EQ(fifo.size(), 1);
+}
+
+TEST_F(FIFOTest, TryPushAndTryPopSameCycleFullFIFO) {
+    FIFO fifo(1);
+
+    // Push first element
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.size(), 1);
+    EXPECT_FALSE(fifo.getInputReady());
+
+    // Push and pop in same cycle (order: push then pop)
+    fifo.setInputValid(true);
+    fifo.setOutputReady(true);
+    fifo.toggleClock();
+
+    // Should still have 1 element (pushed 1, popped 1)
+    EXPECT_EQ(fifo.size(), 0);
+
+    fifo.reset(1);
+
+    // Push first element
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.size(), 1);
+    EXPECT_FALSE(fifo.getInputReady());
+
+    // Push and pop in same cycle (order: push then pop)
+    fifo.setInputValid(false);
+    fifo.setOutputReady(true);
+    fifo.toggleClock();
+
+    // Should still have 1 element (pushed 1, popped 1)
+    EXPECT_EQ(fifo.size(), 0);
+}
+
+TEST_F(FIFOTest, TryPushAndTryPopSequence) {
+    FIFO fifo(10);
+
+    // Push 3
+    for (int i = 0; i < 3; ++i) {
+        fifo.setInputValid(true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.size(), 3);
+
+    // Pop 2
+    for (int i = 0; i < 2; ++i) {
+        fifo.setOutputReady(true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.size(), 1);
+
+    // Push 1 more
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.size(), 2);
+}
+
+TEST_F(FIFOTest, TryPushAndTryPopStreaming) {
+    FIFO fifo(10);
+
+    // Initialize with one element
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+
+    // Stream: push and pop simultaneously for many cycles
+    for (int i = 0; i < 100; ++i) {
+        fifo.setInputValid(true);
+        fifo.setOutputReady(true);
+        fifo.toggleClock();
+        EXPECT_EQ(fifo.size(), 1);  // Size should remain constant
+    }
+}
+
+TEST_F(FIFOTest, TryPushAlternatingValid) {
+    FIFO fifo(10);
+
+    for (int i = 0; i < 10; ++i) {
+        fifo.setInputValid(i % 2 == 0);  // Push only on even iterations
+        fifo.toggleClock();
+    }
+
+    EXPECT_EQ(fifo.size(), 5);  // Should have 5 elements
+}
+
+TEST_F(FIFOTest, TryPopAlternatingReady) {
+    FIFO fifo(10);
+
+    // Fill with 6 elements
+    for (int i = 0; i < 6; ++i) {
+        fifo.setInputValid(true);
+        fifo.toggleClock();
+    }
+
+    // Pop alternating
+    for (int i = 0; i < 10; ++i) {
+        fifo.setOutputReady(i % 2 == 0);  // Pop only on even iterations
+        fifo.toggleClock();
+    }
+
+    EXPECT_EQ(fifo.size(), 1);  // 6 - 5 pops = 1
+}
+
+TEST_F(FIFOTest, SizeMethodCorrectness) {
+    FIFO fifo(20);
+
+    EXPECT_EQ(fifo.size(), 0);
+
+    for (int i = 1; i <= 10; ++i) {
+        fifo.setInputValid(true);
+        fifo.toggleClock();
+        EXPECT_EQ(fifo.size(), i);
+    }
+
+    for (int i = 9; i >= 0; --i) {
+        fifo.setOutputReady(true);
+        fifo.toggleClock();
+        EXPECT_EQ(fifo.size(), i);
+    }
+}
+
+TEST_F(FIFOTest, TryMethodsVsUpdateEquivalence) {
+    FIFO fifo1(10);
+    FIFO fifo2(10);
+
+    // Use update() on fifo1
+    fifo1.update(true, false);  // Push
+    fifo1.toggleClock();
+    fifo1.update(true, false);  // Push
+    fifo1.toggleClock();
+    fifo1.update(false, true);  // Pop
+    fifo1.toggleClock();
+
+    // Use tryPush/tryPop on fifo2
+    fifo2.setInputValid(true);
+    fifo2.toggleClock();
+    fifo2.setInputValid(true);
+    fifo2.toggleClock();
+    fifo2.setOutputReady(true);
+    fifo2.toggleClock();
+
+    // Should have same result
+    EXPECT_EQ(fifo1.size(), fifo2.size());
+    EXPECT_EQ(fifo1.isEmpty(), fifo2.isEmpty());
+    EXPECT_EQ(fifo1.getOutputValid(), fifo2.getOutputValid());
+}
+
+TEST_F(FIFOTest, TryMethodsBurstPattern) {
+    FIFO fifo(50);
+
+    // Burst of pushes
+    for (int i = 0; i < 30; ++i) {
+        fifo.setInputValid(true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.size(), 30);
+
+    // Burst of pops
+    for (int i = 0; i < 20; ++i) {
+        fifo.setOutputReady(true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.size(), 10);
+
+    // Mixed burst
+    for (int i = 0; i < 15; ++i) {
+        fifo.setInputValid(true);
+        fifo.setOutputReady(true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.size(), 10);  // Should remain constant
+}
+
+TEST_F(FIFOTest, TryMethodsStressTest) {
+    FIFO fifo(1000);
+
+    // Complex pattern
+    for (int i = 0; i < 500; ++i) {
+        fifo.setInputValid(i % 3 != 0);  // Push 2 out of 3 times
+        if (i > 100) {
+            fifo.setOutputReady(i % 2 == 0);  // Pop every other time after 100
+        }
+        fifo.toggleClock();
+    }
+
+    // Verify FIFO is in valid state
+    EXPECT_LE(fifo.size(), 1000);
+    EXPECT_EQ(fifo.size() == 0, fifo.isEmpty());
+    EXPECT_EQ(fifo.size() > 0, fifo.getOutputValid());
+}
+
+TEST_F(FIFOTest, TryMethodsEdgeCaseFullToEmpty) {
+    FIFO fifo(5);
+
+    // Fill completely
+    for (int i = 0; i < 5; ++i) {
+        fifo.setInputValid(true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.size(), 5);
+    EXPECT_FALSE(fifo.getInputReady());
+
+    // Empty completely
+    for (int i = 0; i < 5; ++i) {
+        fifo.setOutputReady(true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.size(), 0);
+    EXPECT_TRUE(fifo.isEmpty());
+    EXPECT_FALSE(fifo.getOutputValid());
+}
+
+TEST_F(FIFOTest, TryMethodsWithReset) {
+    FIFO fifo(10);
+
+    // Add some elements
+    for (int i = 0; i < 5; ++i) {
+        fifo.setInputValid(true);
+        fifo.toggleClock();
+    }
+    EXPECT_EQ(fifo.size(), 5);
+
+    // Reset
+    fifo.reset(10);
+    EXPECT_EQ(fifo.size(), 0);
+
+    // Should work normally after reset
+    fifo.setInputValid(true);
+    fifo.toggleClock();
+    EXPECT_EQ(fifo.size(), 1);
+}
+
+TEST_F(FIFOTest, TestTimeout){
+    FIFO fifo(10);
+    fifo.setCyclesUntilExpectedFirstValid(3);
+    EXPECT_TRUE(fifo.toggleClock());  // 3 cycles left
+    EXPECT_TRUE(fifo.toggleClock());  // 2 cycles left
+    EXPECT_FALSE(fifo.toggleClock());  // 0 cycles left, should return false
+
+    fifo.reset(10);
+    fifo.setCyclesUntilExpectedFirstValid(2);
+    EXPECT_TRUE(fifo.toggleClock());  // 2 cycles left
+    fifo.update(true, false);        // Set valid, should disable timeout
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+
+    fifo.reset(10);
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+    EXPECT_TRUE(fifo.toggleClock());  // Still should return true
+}
+
+// Main function to run all tests
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/finn_xsi/finn_xsi/unittests/Integration_test.cpp b/finn_xsi/finn_xsi/unittests/Integration_test.cpp
new file mode 100644
index 0000000000..2edb9d38da
--- /dev/null
+++ b/finn_xsi/finn_xsi/unittests/Integration_test.cpp
@@ -0,0 +1,642 @@
+#include <gtest/gtest.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <cstddef>
+
+#include "FIFO.h"
+#include "InterprocessCommunicationChannel.hpp"
+
+// Test fixture for integration tests
+class IntegrationTest : public ::testing::Test {
+     protected:
+    std::string shmName;
+
+    void SetUp() override {
+        // Generate unique shared memory name for each test
+        shmName = "test_shm_integration_" + std::to_string(getpid());
+
+        // Clean up any leftover shared memory from previous runs
+        boost::interprocess::shared_memory_object::remove(shmName.c_str());
+    }
+
+    void TearDown() override {
+        // Clean up shared memory after test
+        boost::interprocess::shared_memory_object::remove(shmName.c_str());
+    }
+};
+
+class SimDummy {
+    bool currentValid = false;
+    bool currentReady = true;
+    bool nextValid = false;
+    bool nextReady = true;
+
+     public:
+    bool isOutputValid() const { return currentValid; }
+    void toggleClock() {
+        currentValid = nextValid;
+        currentReady = nextReady;
+    }
+    bool isInputReady() const { return currentReady; }
+    void setNextValid(bool v) { nextValid = v; }
+    void setNextReady(bool r) { nextReady = r; }
+};
+
+// ===== Basic Integration Tests =====
+
+TEST_F(IntegrationTest, OneCycleReadyFalseValidFalse) {
+    // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair
+    // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver with FIFO output to SimDummy
+        int receivedCount = 0;
+        {
+            InterprocessCommunicationChannel<bool, bool, false> receiver(shmName);
+            FIFO outputFifo(15);
+            SimDummy simDummy;
+
+            simDummy.setNextReady(false);
+            bool readySignal = outputFifo.getInputReady();
+            bool validSignal = receiver.receive_request();
+            if (validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 1 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 15) {
+                exit(3);
+            }
+            if (outputFifo.getInputReady() != true) {
+                exit(4);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (simDummy.isOutputValid()) {
+                exit(1);
+            }
+        }  // Destructor called here
+        exit(0);
+    }
+
+    // Parent process: Sender
+    {
+        InterprocessCommunicationChannel<bool, bool, true> sender(shmName);
+
+        bool validSignal = false;
+        bool incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 0; expect ready==false for cycle 1
+
+    }  // Destructor called here
+
+    // Wait for child
+    int status;
+    waitpid(pid, &status, 0);
+    EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST_F(IntegrationTest, OneCycleReadyTrueValidFalse) {
+    // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair
+    // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver with FIFO output to SimDummy
+        int receivedCount = 0;
+        {
+            InterprocessCommunicationChannel<bool, bool, false> receiver(shmName);
+            FIFO outputFifo(15);
+            SimDummy simDummy;
+
+            simDummy.setNextReady(true);
+            bool readySignal = outputFifo.getInputReady();
+            bool validSignal = receiver.receive_request();
+            if (validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 1 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 15) {
+                exit(3);
+            }
+            if (outputFifo.getInputReady() != true) {
+                exit(4);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (simDummy.isOutputValid()) {
+                exit(1);
+            }
+        }  // Destructor called here
+        exit(0);
+    }
+
+    // Parent process: Sender
+    {
+        InterprocessCommunicationChannel<bool, bool, true> sender(shmName);
+
+        bool validSignal = false;
+        bool incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);
+
+    }  // Destructor called here
+
+    // Wait for child
+    int status;
+    waitpid(pid, &status, 0);
+    EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST_F(IntegrationTest, OneCycleReadyFalseValidTrue) {
+    // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair
+    // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver with FIFO output to SimDummy
+        int receivedCount = 0;
+        {
+            InterprocessCommunicationChannel<bool, bool, false> receiver(shmName);
+            FIFO outputFifo(15);
+            SimDummy simDummy;
+
+            simDummy.setNextReady(false);
+            bool readySignal = outputFifo.getInputReady();
+            bool validSignal = receiver.receive_request();
+            if (!validSignal) {  // It is correct that valid is true here, because we only have a single cycle and the sender input is set to valid in cycle 0. Therefore, we should
+                                 // receive a valid in cycle 0.
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 1 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 14) {
+                exit(3);
+            }
+            if (outputFifo.getInputReady() != true) {
+                exit(4);
+            }
+            if (!outputFifo.getOutputValid()) {
+                exit(5);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (!simDummy.isOutputValid()) {
+                exit(1);
+            }
+        }  // Destructor called here
+        exit(0);
+    }
+
+    // Parent process: Sender
+    {
+        InterprocessCommunicationChannel<bool, bool, true> sender(shmName);
+
+        bool validSignal = true;
+        bool incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 0; expect ready==true for cycle 1
+
+    }  // Destructor called here
+
+    // Wait for child
+    int status;
+    waitpid(pid, &status, 0);
+    EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST_F(IntegrationTest, OneCycleReadyTrueValidTrue) {
+    // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair
+    // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver with FIFO output to SimDummy
+        int receivedCount = 0;
+        {
+            InterprocessCommunicationChannel<bool, bool, false> receiver(shmName);
+            FIFO outputFifo(15);
+            SimDummy simDummy;
+
+            simDummy.setNextReady(true);
+            bool readySignal = outputFifo.getInputReady();
+            bool validSignal = receiver.receive_request();
+            if (!validSignal) {  // It is correct that valid is true here, because we only have a single cycle and the sender input is set to valid in cycle 0. Therefore, we should
+                                 // receive a valid in cycle 0.
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 1 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 14) {
+                exit(3);
+            }
+            if (outputFifo.getInputReady() != true) {
+                exit(4);
+            }
+            if (!outputFifo.getOutputValid()) {
+                exit(5);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (!simDummy.isOutputValid()) {
+                exit(1);
+            }
+        }  // Destructor called here
+        exit(0);
+    }
+
+    // Parent process: Sender
+    {
+        InterprocessCommunicationChannel<bool, bool, true> sender(shmName);
+
+        bool validSignal = true;
+        bool incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);
+
+    }  // Destructor called here
+
+    // Wait for child
+    int status;
+    waitpid(pid, &status, 0);
+    EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+// ===== Multicycle Integration Tests =====
+
+TEST_F(IntegrationTest, TwoCycleReadyFalseValidFalse) {
+    // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair
+    // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver with FIFO output to SimDummy
+        int receivedCount = 0;
+        {
+            InterprocessCommunicationChannel<bool, bool, false> receiver(shmName);
+            FIFO outputFifo(15);
+            SimDummy simDummy;
+
+            simDummy.setNextReady(false);
+            bool readySignal = outputFifo.getInputReady();
+            bool validSignal = receiver.receive_request();
+            if (validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 1 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 15) {
+                exit(3);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (simDummy.isOutputValid()) {
+                exit(1);
+            }
+
+            simDummy.setNextReady(false);
+            readySignal = outputFifo.getInputReady();  // Should be true
+            validSignal = receiver.receive_request();
+            if (validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 2 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 15) {
+                exit(3);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (simDummy.isOutputValid()) {
+                exit(1);
+            }
+
+        }  // Destructor called here
+        exit(0);
+    }
+
+    // Parent process: Sender
+    {
+        InterprocessCommunicationChannel<bool, bool, true> sender(shmName);
+
+        bool validSignal = false;
+        bool incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 0; expect ready==true for cycle 1
+        incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 1; expect ready==true for cycle 2
+
+    }  // Destructor called here
+
+    // Wait for child
+    int status;
+    waitpid(pid, &status, 0);
+    EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST_F(IntegrationTest, TwoCycleReadyTrueValidFalse) {
+    // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair
+    // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver with FIFO output to SimDummy
+        int receivedCount = 0;
+        {
+            InterprocessCommunicationChannel<bool, bool, false> receiver(shmName);
+            FIFO outputFifo(15);
+            SimDummy simDummy;
+
+            simDummy.setNextReady(true);
+            bool readySignal = outputFifo.getInputReady();
+            bool validSignal = receiver.receive_request();
+            if (validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 1 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 15) {
+                exit(3);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (simDummy.isOutputValid()) {
+                exit(1);
+            }
+
+            simDummy.setNextReady(true);
+            readySignal = outputFifo.getInputReady();  // Should be true now
+            validSignal = receiver.receive_request();
+            if (validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 2 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 15) {
+                exit(3);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (simDummy.isOutputValid()) {
+                exit(1);
+            }
+
+        }  // Destructor called here
+        exit(0);
+    }
+
+    // Parent process: Sender
+    {
+        InterprocessCommunicationChannel<bool, bool, true> sender(shmName);
+
+        bool validSignal = false;
+        bool incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 0; expect ready==true for cycle 1
+        incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 1; expect ready==true for cycle 2
+
+    }  // Destructor called here
+
+    // Wait for child
+    int status;
+    waitpid(pid, &status, 0);
+    EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST_F(IntegrationTest, TwoCycleReadyFalseValidTrue) {
+    // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair
+    // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver with FIFO output to SimDummy
+        int receivedCount = 0;
+        {
+            InterprocessCommunicationChannel<bool, bool, false> receiver(shmName);
+            FIFO outputFifo(15);
+            SimDummy simDummy;
+
+            simDummy.setNextReady(false);
+            bool readySignal = outputFifo.getInputReady();
+            bool validSignal = receiver.receive_request();
+            if (!validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 1 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 14) {
+                exit(3);
+            }
+            if (!outputFifo.getOutputValid()) {
+                exit(4);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (!simDummy.isOutputValid()) {
+                exit(1);
+            }
+
+            simDummy.setNextReady(false);
+            readySignal = outputFifo.getInputReady();  // Should be true now (FIFO not full)
+            validSignal = receiver.receive_request();
+            if (!validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 2 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 13) {
+                exit(3);
+            }
+            if (!outputFifo.getOutputValid()) {
+                exit(4);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (!simDummy.isOutputValid()) {
+                exit(1);
+            }
+
+        }  // Destructor called here
+        exit(0);
+    }
+
+    // Parent process: Sender
+    {
+        InterprocessCommunicationChannel<bool, bool, true> sender(shmName);
+
+        bool validSignal = true;
+        bool incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 0; expect ready==true for cycle 1
+        incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 1; expect ready==true for cycle 2
+
+    }  // Destructor called here
+
+    // Wait for child
+    int status;
+    waitpid(pid, &status, 0);
+    EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+
+TEST_F(IntegrationTest, TwoCycleReadyTrueValidTrue) {
+    // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair
+    // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation
+
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver with FIFO output to SimDummy
+        int receivedCount = 0;
+        {
+            InterprocessCommunicationChannel<bool, bool, false> receiver(shmName);
+            FIFO outputFifo(15);
+            SimDummy simDummy;
+
+            simDummy.setNextReady(true);
+            bool readySignal = outputFifo.getInputReady();
+            bool validSignal = receiver.receive_request();
+            if (!validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 1 STARTS
+
+            // Verify FIFO state and SimDummy
+            if (outputFifo.getSpaceLeft() != 14) {
+                exit(3);
+            }
+            if (!outputFifo.getOutputValid()) {
+                exit(4);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (!simDummy.isOutputValid()) {
+                exit(1);
+            }
+
+            simDummy.setNextReady(true);
+            readySignal = outputFifo.getInputReady();  // Should be true now
+            validSignal = receiver.receive_request();
+            if (!validSignal) {
+                exit(2);
+            }
+            receiver.send_response(readySignal);
+            outputFifo.update(validSignal, simDummy.isInputReady());
+            outputFifo.toggleClock();  // BELOW HERE CYCLE 2 STARTS
+
+            // Verify FIFO state and SimDummy - FIFO consumes data because SimDummy is ready
+            if (outputFifo.getSpaceLeft() != 14) {
+                exit(3);
+            }
+            if (!outputFifo.getOutputValid()) {
+                exit(4);
+            }
+            simDummy.setNextValid(outputFifo.getOutputValid());
+            simDummy.toggleClock();
+            if (!simDummy.isOutputValid()) {
+                exit(1);
+            }
+
+        }  // Destructor called here
+        exit(0);
+    }
+
+    // Parent process: Sender
+    {
+        InterprocessCommunicationChannel<bool, bool, true> sender(shmName);
+
+        bool validSignal = true;
+        bool incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 0; expect ready==true for cycle 1
+        incomingReady = sender.send_request(validSignal);
+        EXPECT_TRUE(incomingReady);  // We are in cycle 1; expect ready==true for cycle 2
+
+    }  // Destructor called here
+
+    // Wait for child
+    int status;
+    waitpid(pid, &status, 0);
+    EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+
+// ===== Sender Side Integration Tests =====
+
+TEST_F(IntegrationTest, SimToFIFO) {
+    // Architecture: SimDummy -> FIFO
+
+    SimDummy sim;
+    FIFO fifo(15);
+
+    // Propagate valid through SimDummy
+    sim.setNextValid(true);
+    fifo.update(sim.isOutputValid(), false);
+    EXPECT_TRUE(fifo.getInputReady());
+    sim.setNextReady(fifo.getInputReady());
+    fifo.toggleClock();
+    sim.toggleClock();
+    EXPECT_EQ(fifo.size(), 0);
+    EXPECT_TRUE(sim.isInputReady());
+
+    // Fill FIFO to capacity
+    for (std::size_t i = 0; i < 15; ++i) {
+        sim.setNextValid(true);
+
+        fifo.update(sim.isOutputValid(), false);
+        EXPECT_TRUE(fifo.getInputReady());
+        sim.setNextReady(fifo.getInputReady());
+        EXPECT_EQ(fifo.size(), i);
+        fifo.toggleClock();
+        sim.toggleClock();
+        EXPECT_EQ(fifo.size(), i + 1);
+        EXPECT_TRUE(sim.isInputReady());
+    }
+
+    EXPECT_FALSE(fifo.getInputReady());  // FIFO changed to not ready on this cycle; Sim is still ready
+    sim.setNextValid(true);
+    fifo.update(sim.isOutputValid(), false);
+    EXPECT_FALSE(fifo.getInputReady());
+    sim.setNextReady(fifo.getInputReady());
+    fifo.toggleClock();
+    sim.toggleClock();  // Propagate ready false through sim
+
+    EXPECT_EQ(fifo.size(), 15);
+    EXPECT_FALSE(sim.isInputReady());
+}
diff --git a/finn_xsi/finn_xsi/unittests/InterprocessCommunicationChannel_test.cpp b/finn_xsi/finn_xsi/unittests/InterprocessCommunicationChannel_test.cpp
new file mode 100644
index 0000000000..8952c6e002
--- /dev/null
+++ b/finn_xsi/finn_xsi/unittests/InterprocessCommunicationChannel_test.cpp
@@ -0,0 +1,1163 @@
+#include "InterprocessCommunicationChannel.hpp"
+
+#include <gtest/gtest.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <thread>
+
+// Simple request/response types for testing
+struct TestRequest {
+    int value;
+    bool flag;
+
+    TestRequest() : value(0), flag(false) {}
+    TestRequest(int v, bool f) : value(v), flag(f) {}
+
+    bool operator==(const TestRequest& other) const { return value == other.value && flag == other.flag; }
+};
+
+struct TestResponse {
+    int result;
+    bool success;
+
+    TestResponse() : result(0), success(false) {}
+    TestResponse(int r, bool s) : result(r), success(s) {}
+
+    bool operator==(const TestResponse& other) const { return result == other.result && success == other.success; }
+};
+
+// Test fixture for InterprocessCommunicationChannel tests
+class InterprocessCommunicationChannelTest : public ::testing::Test {
+     protected:
+    void SetUp() override {
+        // Generate unique shared memory name for each test
+        shmName = "test_ipc_" + std::to_string(getpid()) + "_" + std::to_string(std::chrono::steady_clock::now().time_since_epoch().count());
+    }
+
+    void TearDown() override {
+        // Cleanup: ensure shared memory is removed
+        boost::interprocess::shared_memory_object::remove(shmName.c_str());
+    }
+
+    std::string shmName;
+};
+
+// ===== Constructor and Initialization Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, SenderConstructorCreatesSharedMemory) {
+    InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+    // Verify that shared memory exists
+    bool shmExists = false;
+    try {
+        boost::interprocess::managed_shared_memory shmem(boost::interprocess::open_only, shmName.c_str());
+        shmExists = true;
+    } catch (...) { shmExists = false; }
+
+    EXPECT_TRUE(shmExists);
+}
+
+TEST_F(InterprocessCommunicationChannelTest, ReceiverWaitsForSenderToCreateSharedMemory) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver (waits for sender)
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        exit(0);
+    } else {
+        // Parent process: Sender (creates shared memory)
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        // Wait for child to complete
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, DefaultConstructorCreatesUninitializedObject) {
+    InterprocessCommunicationChannel<TestRequest, TestResponse, true> channel;
+    // Should not crash - object is in moved-from state
+    // Destructor should handle this gracefully
+}
+
+TEST_F(InterprocessCommunicationChannelTest, MoveConstructorTransfersOwnership) {
+    InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender1(shmName);
+    InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender2(std::move(sender1));
+
+    // sender2 should now own the shared memory
+    // sender1 should be in moved-from state (destructor shouldn't crash)
+}
+
+TEST_F(InterprocessCommunicationChannelTest, MoveAssignmentTransfersOwnership) {
+    InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender1(shmName);
+    InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender2;
+
+    sender2 = std::move(sender1);
+
+    // sender2 should now own the shared memory
+    // sender1 should be in moved-from state
+}
+
+// ===== Single Request-Response Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, SingleRequestResponseExchange) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender sends request, waits for response
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        TestRequest req(42, true);
+        TestResponse resp = sender.send_request(req);
+
+        // Verify response
+        exit((resp.result == 84 && resp.success) ? 0 : 1);
+    } else {
+        // Parent process: Receiver waits for request, sends response
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+
+        // Small delay to ensure both processes are ready
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        TestRequest req = receiver.receive_request();
+        EXPECT_EQ(req.value, 42);
+        EXPECT_TRUE(req.flag);
+
+        // Send response (double the request value)
+        TestResponse resp(req.value * 2, true);
+        receiver.send_response(resp);
+
+        // Wait for child and check result
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, RequestResponseWithDifferentValues) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        TestRequest req(100, false);
+        TestResponse resp = sender.send_request(req);
+
+        exit((resp.result == 200 && !resp.success) ? 0 : 1);
+    } else {
+        // Parent process: Receiver
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        TestRequest req = receiver.receive_request();
+        EXPECT_EQ(req.value, 100);
+        EXPECT_FALSE(req.flag);
+
+        TestResponse resp(req.value * 2, req.flag);
+        receiver.send_response(resp);
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, SingleSplitJoinRequest) {
+    // Test that a diamond pattern of communication works
+    pid_t p1 = fork();
+    pid_t p2 = fork();
+    pid_t p3 = fork();
+    std::string leftName = shmName + "_left_in";
+    std::string rightName = shmName + "_right_in";
+    std::string leftOutName = shmName + "_left_out";
+    std::string rightOutName = shmName + "_right_out";
+
+    if (p1 != 0 && p2 != 0 && p3 != 0) {
+        // Parent (origin)
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> originToLeft(leftName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> originToRight(rightName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        originToLeft.handshake();
+        originToRight.handshake();
+
+        // Send message to the left
+        TestRequest reqLeft(100, false);
+        TestResponse respLeft = originToLeft.send_request(reqLeft);
+        EXPECT_EQ(respLeft.result, 600);
+
+        // Send message to the right
+        TestRequest reqRight(130, false);
+        TestResponse respRight = originToRight.send_request(reqRight);
+        EXPECT_EQ(respRight.result, 780);
+        std::cout << "Origin done." << std::endl;
+
+    } else if (p1 == 0 && p2 != 0 && p3 != 0) {
+        // P1 (Left)
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> fromOrigin(leftName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> toEnd(leftOutName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        fromOrigin.handshake();
+        toEnd.handshake();
+
+        // Receive from origin
+        TestRequest req = fromOrigin.receive_request();
+        EXPECT_EQ(req.value, 100);
+        EXPECT_FALSE(req.flag);
+
+        // Forward triple
+        TestRequest reqForward(req.value * 3, req.flag);
+        TestResponse resp = toEnd.send_request(reqForward);
+        auto expectedResponseFromEnd = req.value * 2 * 3;
+        EXPECT_EQ(resp.result, expectedResponseFromEnd);
+
+        // Answer with value from end
+        TestResponse respOrigin(resp.result, resp.success);
+        fromOrigin.send_response(respOrigin);
+
+        std::cout << "Left done." << std::endl;
+        exit((req.value == 100 && resp.result == expectedResponseFromEnd) ? 0 : 1);
+
+    } else if (p1 != 0 && p2 == 0 && p3 != 0) {
+        // P2 (Right)
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> fromOrigin(rightName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> toEnd(rightOutName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        fromOrigin.handshake();
+        toEnd.handshake();
+
+        // Receive from origin
+        TestRequest req = fromOrigin.receive_request();
+        EXPECT_EQ(req.value, 130);
+        EXPECT_FALSE(req.flag);
+
+        // Forward triple
+        TestRequest reqForward(req.value * 3, req.flag);
+        TestResponse resp = toEnd.send_request(reqForward);
+        auto expectedResponseFromEnd = req.value * 2 * 3;
+        EXPECT_EQ(resp.result, expectedResponseFromEnd);
+
+        // Answer with value from end
+        TestResponse respOrigin(resp.result, resp.success);
+        fromOrigin.send_response(respOrigin);
+
+        std::cout << "Right done." << std::endl;
+        exit((req.value == 130 && resp.result == expectedResponseFromEnd) ? 0 : 1);
+
+    } else if (p1 != 0 && p2 != 0 && p3 == 0) {
+        // End
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> endLeft(leftOutName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> endRight(rightOutName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        endLeft.handshake();
+        endRight.handshake();
+
+        // Receive and return double
+        TestRequest reqLeft = endLeft.receive_request();
+        EXPECT_EQ(reqLeft.value, 300);
+        TestResponse respLeft(reqLeft.value * 2, true);
+        endLeft.send_response(respLeft);
+
+        // Receive and return double
+        TestRequest reqRight = endRight.receive_request();
+        EXPECT_EQ(reqRight.value, 390);
+        TestResponse respRight(reqRight.value * 2, true);
+        endRight.send_response(respRight);
+
+        std::cout << "End done." << std::endl;
+        exit((reqLeft.value == 300 && reqRight.value == 390) ? 0 : 1);
+    }
+
+    // Wait for all forks to shut down
+    if (p1 != 0 && p2 != 0 && p3 != 0) {
+        int status;
+        waitpid(p1, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+        waitpid(p2, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+        waitpid(p3, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+
+}
+
+// ===== Multiple Request-Response Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, MultipleRequestResponseSequential) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender sends multiple requests
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        for (int i = 0; i < 10; ++i) {
+            TestRequest req(i, i % 2 == 0);
+            TestResponse resp = sender.send_request(req);
+
+            // Verify response matches expected calculation
+            if (resp.result != i * 3 || resp.success != (i % 2 == 0)) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver processes multiple requests
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        for (int i = 0; i < 10; ++i) {
+            TestRequest req = receiver.receive_request();
+            EXPECT_EQ(req.value, i);
+            EXPECT_EQ(req.flag, i % 2 == 0);
+
+            // Send calculated response
+            TestResponse resp(req.value * 3, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, ManyRequestResponseExchanges) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        for (int i = 0; i < 1000; ++i) {
+            TestRequest req(i % 100, i % 3 == 0);
+            TestResponse resp = sender.send_request(req);
+
+            // Verify response
+            int expected = (i % 100) + 10;
+            if (resp.result != expected) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        for (int i = 0; i < 1000; ++i) {
+            TestRequest req = receiver.receive_request();
+
+            // Just verify exchange completes without deadlock
+            int expected_val = i % 100;
+            EXPECT_EQ(req.value, expected_val);
+
+            TestResponse resp(req.value + 10, true);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, AlternatingRequestPattern) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender with alternating pattern
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        for (int i = 0; i < 100; ++i) {
+            bool flag = (i % 2 == 0);
+            TestRequest req(i, flag);
+            TestResponse resp = sender.send_request(req);
+
+            // Verify response
+            if (resp.result != i * 2 || resp.success != flag) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        for (int i = 0; i < 100; ++i) {
+            TestRequest req = receiver.receive_request();
+            EXPECT_EQ(req.value, i);
+            EXPECT_EQ(req.flag, i % 2 == 0);
+
+            TestResponse resp(req.value * 2, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// ===== Buffer Flipping Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, BufferFlipsCorrectly) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        // Perform multiple exchanges to trigger buffer flips
+        for (int i = 0; i < 20; ++i) {
+            TestRequest req(i, true);
+            TestResponse resp = sender.send_request(req);
+
+            if (resp.result != i + 1) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        // Perform multiple exchanges - buffer should flip multiple times
+        for (int i = 0; i < 20; ++i) {
+            TestRequest req = receiver.receive_request();
+            EXPECT_EQ(req.value, i);
+
+            TestResponse resp(req.value + 1, true);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// ===== Stress Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, HighFrequencyExchanges) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender - rapid exchanges
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        for (int i = 0; i < 10000; ++i) {
+            TestRequest req(i & 0xFF, i & 1);
+            TestResponse resp = sender.send_request(req);
+
+            if (resp.result != (i & 0xFF) * 2) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver - rapid exchanges
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        for (int i = 0; i < 10000; ++i) {
+            TestRequest req = receiver.receive_request();
+
+            TestResponse resp(req.value * 2, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, StressTestWithComplexPattern) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender with complex pattern
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        for (int i = 0; i < 5000; ++i) {
+            int val = (i * 7) % 127;
+            bool flag = ((i * 11) % 13) < 6;
+            TestRequest req(val, flag);
+            TestResponse resp = sender.send_request(req);
+
+            if (resp.result != val + 5) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver with response calculation
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        for (int i = 0; i < 5000; ++i) {
+            TestRequest req = receiver.receive_request();
+
+            TestResponse resp(req.value + 5, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// ===== Reference Counting Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, ReferenceCountingTwoProcesses) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Create sender and let it go out of scope
+        {
+            InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+            TestRequest req(1, true);
+            sender.send_request(req);
+        }
+
+        // Shared memory should still exist because parent still holds reference
+        bool shmExists = false;
+        try {
+            boost::interprocess::managed_shared_memory shmem(boost::interprocess::open_only, shmName.c_str());
+            shmExists = true;
+        } catch (...) { shmExists = false; }
+
+        exit(shmExists ? 0 : 1);
+    } else {
+        // Parent process: Keep receiver alive
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        TestRequest req = receiver.receive_request();
+        TestResponse resp(req.value, req.flag);
+        receiver.send_response(resp);
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, SharedMemoryCleanupAfterBothProcessesExit) {
+    // This test verifies that shared memory is properly cleaned up
+    // when both processes exit.
+
+    pid_t verifier_pid = fork();
+
+    if (verifier_pid == 0) {
+        // Verifier process: spawns two children and then checks cleanup
+        pid_t sender_pid = fork();
+
+        if (sender_pid == 0) {
+            // First child: Sender
+            // Use block scope so destructor is called before exit
+            {
+                InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+                TestRequest req(42, true);
+                sender.send_request(req);
+            }  // Destructor called here
+            exit(0);
+        }
+
+        // Small delay to ensure sender creates shared memory
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+
+        pid_t receiver_pid = fork();
+        if (receiver_pid == 0) {
+            // Second child: Receiver
+            // Use block scope so destructor is called before exit
+            {
+                InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+                TestRequest req = receiver.receive_request();
+                TestResponse resp(req.value, req.flag);
+                receiver.send_response(resp);
+            }  // Destructor called here
+            exit(0);
+        }
+
+        // Wait for both children to complete
+        int sender_status, receiver_status;
+        waitpid(sender_pid, &sender_status, 0);
+        waitpid(receiver_pid, &receiver_status, 0);
+
+        // Give time for cleanup to complete
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+        // Verify shared memory is cleaned up
+        bool shmExists = false;
+        try {
+            boost::interprocess::managed_shared_memory shmem(boost::interprocess::open_only, shmName.c_str());
+            shmExists = true;
+        } catch (...) { shmExists = false; }
+
+        // Exit with 0 if cleanup succeeded (shmExists == false)
+        exit(shmExists ? 1 : 0);
+    } else {
+        // Parent: Wait for verifier process
+        int status;
+        waitpid(verifier_pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// ===== Move Semantics Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, MoveConstructorMaintainsConnection) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender with move
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender1(shmName);
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender2(std::move(sender1));
+
+        TestRequest req(99, false);
+        TestResponse resp = sender2.send_request(req);
+
+        exit((resp.result == 99 && !resp.success) ? 0 : 1);
+    } else {
+        // Parent process: Receiver
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        TestRequest req = receiver.receive_request();
+        EXPECT_EQ(req.value, 99);
+        EXPECT_FALSE(req.flag);
+
+        TestResponse resp(req.value, req.flag);
+        receiver.send_response(resp);
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, MoveAssignmentMaintainsConnection) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender with move assignment
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender1(shmName);
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender2;
+        sender2 = std::move(sender1);
+
+        TestRequest req(77, true);
+        TestResponse resp = sender2.send_request(req);
+
+        exit((resp.result == 77 && resp.success) ? 0 : 1);
+    } else {
+        // Parent process: Receiver
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        TestRequest req = receiver.receive_request();
+        EXPECT_EQ(req.value, 77);
+        EXPECT_TRUE(req.flag);
+
+        TestResponse resp(req.value, req.flag);
+        receiver.send_response(resp);
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// ===== Edge Cases =====
+
+TEST_F(InterprocessCommunicationChannelTest, FirstCallBehavior) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender - first call should not wait for buffer flip
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        auto start = std::chrono::steady_clock::now();
+        TestRequest req(1, true);
+        sender.send_request(req);
+        auto end = std::chrono::steady_clock::now();
+
+        // First call should complete quickly (not waiting for previous flip)
+        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+        exit(duration.count() < 100 ? 0 : 1);
+    } else {
+        // Parent process: Receiver
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        TestRequest req = receiver.receive_request();
+        TestResponse resp(req.value, req.flag);
+        receiver.send_response(resp);
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, ConsecutiveRequestsSameValue) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Send same request repeatedly
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        for (int i = 0; i < 50; ++i) {
+            TestRequest req(123, true);
+            TestResponse resp = sender.send_request(req);
+
+            if (resp.result != 123 || !resp.success) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Verify same request received repeatedly
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        for (int i = 0; i < 50; ++i) {
+            TestRequest req = receiver.receive_request();
+            EXPECT_EQ(req.value, 123);
+            EXPECT_TRUE(req.flag);
+
+            TestResponse resp(req.value, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// ===== Timing and Synchronization Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, SynchronizationBetweenProcesses) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender - delayed start
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        for (int i = 0; i < 10; ++i) {
+            TestRequest req(i, true);
+            sender.send_request(req);
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver - starts immediately
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        // Should wait for sender to be ready
+        for (int i = 0; i < 10; ++i) {
+            TestRequest req = receiver.receive_request();
+            EXPECT_EQ(req.value, i);
+
+            TestResponse resp(req.value, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// ===== Custom Shared Memory Size Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, CustomSharedMemorySize) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender with larger shared memory
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true, 8192> sender(shmName);
+
+        TestRequest req(55, false);
+        TestResponse resp = sender.send_request(req);
+
+        exit((resp.result == 55) ? 0 : 1);
+    } else {
+        // Parent process: Receiver with larger shared memory
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false, 8192> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        TestRequest req = receiver.receive_request();
+        EXPECT_EQ(req.value, 55);
+
+        TestResponse resp(req.value, req.flag);
+        receiver.send_response(resp);
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// ===== Stop Token Tests =====
+
+TEST_F(InterprocessCommunicationChannelTest, SenderCancellationViaStopToken) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender that gets cancelled while waiting for response
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        std::stop_source stop_src;
+        std::jthread canceller([&stop_src]() {
+            // Cancel after 100ms
+            std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            stop_src.request_stop();
+        });
+
+        TestRequest req(42, true);
+        auto start = std::chrono::steady_clock::now();
+        TestResponse resp = sender.send_request(req, stop_src.get_token());
+        auto end = std::chrono::steady_clock::now();
+
+        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+
+        // Should return default response quickly (within 200ms, accounting for scheduling)
+        // and not wait indefinitely for the receiver that never responds
+        exit((duration.count() < 200 && resp.result == 0 && !resp.success) ? 0 : 1);
+    } else {
+        // Parent process: Sender creates shared memory but receiver never responds
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> dummy_sender(shmName);
+
+        // Wait for child to complete
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, ReceiverCancellationViaStopToken) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver that gets cancelled while waiting for request
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+
+        std::stop_source stop_src;
+        std::jthread canceller([&stop_src]() {
+            // Cancel after 100ms
+            std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            stop_src.request_stop();
+        });
+
+        auto start = std::chrono::steady_clock::now();
+        TestRequest req = receiver.receive_request(stop_src.get_token());
+        auto end = std::chrono::steady_clock::now();
+
+        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+
+        // Should return default request quickly (within 200ms)
+        // and not wait indefinitely for a request that never comes
+        exit((duration.count() < 200 && req.value == 0 && !req.flag) ? 0 : 1);
+    } else {
+        // Parent process: Sender creates shared memory but never sends request
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        // Wait for child to complete
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, StopTokenDoesNotInterruptNormalOperation) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender with stop token that is never triggered
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        std::stop_source stop_src;
+
+        TestRequest req(99, true);
+        TestResponse resp = sender.send_request(req, stop_src.get_token());
+
+        // Should complete normally and receive proper response
+        exit((resp.result == 198 && resp.success) ? 0 : 1);
+    } else {
+        // Parent process: Receiver responds normally
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        std::stop_source stop_src;
+        TestRequest req = receiver.receive_request(stop_src.get_token());
+        EXPECT_EQ(req.value, 99);
+        EXPECT_TRUE(req.flag);
+
+        TestResponse resp(req.value * 2, req.flag);
+        receiver.send_response(resp);
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, MultipleExchangesWithStopToken) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender performs multiple exchanges with stop token
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        std::stop_source stop_src;
+
+        for (int i = 0; i < 50; ++i) {
+            TestRequest req(i, i % 2 == 0);
+            TestResponse resp = sender.send_request(req, stop_src.get_token());
+
+            if (resp.result != i * 2 || resp.success != (i % 2 == 0)) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver with stop token
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        std::stop_source stop_src;
+
+        for (int i = 0; i < 50; ++i) {
+            TestRequest req = receiver.receive_request(stop_src.get_token());
+            EXPECT_EQ(req.value, i);
+
+            TestResponse resp(req.value * 2, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, SenderCancellationMidExchange) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender that gets cancelled after some exchanges
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        std::stop_source stop_src;
+
+        // Perform a few successful exchanges
+        for (int i = 0; i < 5; ++i) {
+            TestRequest req(i, true);
+            TestResponse resp = sender.send_request(req, stop_src.get_token());
+
+            if (resp.result != i * 2) {
+                exit(1);
+            }
+        }
+
+        // Now trigger cancellation for next exchange
+        std::jthread canceller([&stop_src]() {
+            std::this_thread::sleep_for(std::chrono::milliseconds(50));
+            stop_src.request_stop();
+        });
+
+        // This should be cancelled (receiver won't respond in time)
+        TestRequest req(100, false);
+        auto start = std::chrono::steady_clock::now();
+        TestResponse resp = sender.send_request(req, stop_src.get_token());
+        auto end = std::chrono::steady_clock::now();
+
+        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+
+        // Should return default response due to cancellation
+        exit((duration.count() < 200 && resp.result == 0) ? 0 : 1);
+    } else {
+        // Parent process: Receiver responds to first 5 requests, then delays
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        std::stop_source stop_src;
+
+        // Respond to first 5 requests normally
+        for (int i = 0; i < 5; ++i) {
+            TestRequest req = receiver.receive_request(stop_src.get_token());
+            TestResponse resp(req.value * 2, req.flag);
+            receiver.send_response(resp);
+        }
+
+        // Delay before processing the 6th request (which will be cancelled)
+        std::this_thread::sleep_for(std::chrono::milliseconds(200));
+
+        // Try to receive next request (might be cancelled)
+        TestRequest req = receiver.receive_request(stop_src.get_token());
+        if (req.value == 100) {
+            TestResponse resp(req.value * 2, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, ReceiverCancellationMidExchange) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Receiver that gets cancelled after some exchanges
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+
+        std::stop_source stop_src;
+
+        // Perform a few successful exchanges
+        for (int i = 0; i < 5; ++i) {
+            TestRequest req = receiver.receive_request(stop_src.get_token());
+
+            if (req.value != i) {
+                exit(1);
+            }
+
+            TestResponse resp(req.value * 2, req.flag);
+            receiver.send_response(resp);
+        }
+
+        // Now trigger cancellation for next receive
+        std::jthread canceller([&stop_src]() {
+            std::this_thread::sleep_for(std::chrono::milliseconds(50));
+            stop_src.request_stop();
+        });
+
+        // This should be cancelled (sender will delay)
+        auto start = std::chrono::steady_clock::now();
+        TestRequest req = receiver.receive_request(stop_src.get_token());
+        auto end = std::chrono::steady_clock::now();
+
+        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+
+        // Should return default request due to cancellation
+        exit((duration.count() < 200 && req.value == 0) ? 0 : 1);
+    } else {
+        // Parent process: Sender sends first 5 requests, then delays
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        std::stop_source stop_src;
+
+        // Send first 5 requests normally
+        for (int i = 0; i < 5; ++i) {
+            TestRequest req(i, true);
+            TestResponse resp = sender.send_request(req, stop_src.get_token());
+
+            if (resp.result != i * 2) {
+                // Unexpected response
+                break;
+            }
+        }
+
+        // Delay before sending the 6th request (receiver will be cancelled)
+        std::this_thread::sleep_for(std::chrono::milliseconds(200));
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, ImmediateCancellation) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender with immediately stopped token
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        std::stop_source stop_src;
+        stop_src.request_stop();  // Stop immediately
+
+        TestRequest req(42, true);
+        auto start = std::chrono::steady_clock::now();
+        TestResponse resp = sender.send_request(req, stop_src.get_token());
+        auto end = std::chrono::steady_clock::now();
+
+        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+
+        // Should return immediately with default response
+        exit((duration.count() < 50 && resp.result == 0 && !resp.success) ? 0 : 1);
+    } else {
+        // Parent process: Just creates sender
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> dummy_sender(shmName);
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+TEST_F(InterprocessCommunicationChannelTest, StopTokenWithHighFrequencyExchanges) {
+    pid_t pid = fork();
+
+    if (pid == 0) {
+        // Child process: Sender with many rapid exchanges using stop token
+        InterprocessCommunicationChannel<TestRequest, TestResponse, true> sender(shmName);
+
+        std::stop_source stop_src;
+
+        for (int i = 0; i < 100; ++i) {
+            TestRequest req(i % 10, i % 2 == 0);
+            TestResponse resp = sender.send_request(req, stop_src.get_token());
+
+            if (resp.result != (i % 10) * 3) {
+                exit(1);
+            }
+        }
+        exit(0);
+    } else {
+        // Parent process: Receiver with stop token
+        InterprocessCommunicationChannel<TestRequest, TestResponse, false> receiver(shmName);
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+        std::stop_source stop_src;
+
+        for (int i = 0; i < 100; ++i) {
+            TestRequest req = receiver.receive_request(stop_src.get_token());
+
+            TestResponse resp(req.value * 3, req.flag);
+            receiver.send_response(resp);
+        }
+
+        int status;
+        waitpid(pid, &status, 0);
+        EXPECT_EQ(WEXITSTATUS(status), 0);
+    }
+}
+
+// Main function to run all tests
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/finn_xsi/finn_xsi/xsi_bind.cpp b/finn_xsi/finn_xsi/xsi_bind.cpp
index 6530c84358..1edf80b01b 100644
--- a/finn_xsi/finn_xsi/xsi_bind.cpp
+++ b/finn_xsi/finn_xsi/xsi_bind.cpp
@@ -8,8 +8,11 @@
  * @author	Thomas B. Preußer <thomas.preusser@amd.com>
  ***************************************************************************/
 
+#include <Port.h>
+#include <Design.h>
+#include <Kernel.h>
+
 #include <pybind11/pybind11.h>
-#include "xsi_finn.hpp"
 #include <mutex>
 #include <map>
 
@@ -31,11 +34,6 @@ namespace {
 
 PYBIND11_MODULE(xsi, m) {
 
-	py::class_<Kernel, std::shared_ptr<Kernel>>(m, "Kernel")
-		.def(py::init<std::string const&>())
-		.def("hex_in_lower", &Kernel::hex_in_lower)
-		.def("hex_in_upper", &Kernel::hex_in_upper);
-
 	py::class_<Design, std::unique_ptr<Design, DesignDeleter>>(m, "Design")
 		.def(py::init([](
 			std::shared_ptr<Kernel> const &kernel,
@@ -54,7 +52,7 @@ PYBIND11_MODULE(xsi, m) {
 		.def("get_status",     &Design::get_status)
 		.def("get_error_info", &Design::get_error_info)
 		.def("num_ports",      &Design::num_ports)
-		.def("getPort",        static_cast<Port* (Design::*)(std::string const&)>(&Design::getPort))
+		.def("getPort",        static_cast<Port& (Design::*)(std::string const&)>(&Design::getPort))
 		.def("ports", [](Design &d) {
 			auto const  e = d.ports();
 			return  py::make_iterator(e.begin(), e.end());
diff --git a/finn_xsi/finn_xsi/xsi_finn.cpp b/finn_xsi/finn_xsi/xsi_finn.cpp
deleted file mode 100644
index 19134ac988..0000000000
--- a/finn_xsi/finn_xsi/xsi_finn.cpp
+++ /dev/null
@@ -1,346 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2025, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * @brief	FINN XSI++: C++ XSI Binding used by FINN.
- * @author	Thomas B. Preußer <thomas.preusser@amd.com>
- ***************************************************************************/
-
-#include "xsi_finn.hpp"
-
-#include <iostream>
-#include <algorithm>
-
-
-using namespace xsi;
-
-//===========================================================================
-// Local Helpers
-
-namespace {
-	void* resolve_or_throw(SharedLibrary &lib, char const *const  sym) {
-		auto const  res = lib.getsymbol(sym);
-		if(!res) {
-			throw  std::runtime_error(
-				std::string("Failed to resolve ")
-				.append(sym).append(" in ").append(lib.path())
-			);
-		}
-		return *res;
-	}
-	char  XZ10[4] = { '0', '1', 'Z', 'X' };
-	char  HEX[16] = {
-		'0', '1', '2', '3', '4', '5', '6', '7',
-		'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
-	};
-}
-
-void Kernel::hex_in_lower() {
-	for(unsigned  i =  2; i <  4; i++)  XZ10[i] |=  ' ';
-	for(unsigned  i = 10; i < 16; i++)  HEX [i] |=  ' ';
-}
-void Kernel::hex_in_upper() {
-	for(unsigned  i =  2; i <  4; i++)  XZ10[i] &= ~' ';
-	for(unsigned  i = 10; i < 16; i++)  HEX [i] &= ~' ';
-}
-
-//===========================================================================
-// Shared Library Representation
-
-char const SharedLibrary::library_suffix[] =
-#if defined(_WIN32)
-	".lib";
-#else
-	".so";
-#endif
-
-#if defined(_WIN32)
-namespace {
-	std::string translate_error_message(DWORD  errid) {
-		std::string  msg;
-		LPTSTR  bufptr;
-		FormatMessage(
-			FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-			nullptr,
-			errid,
-			MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
-			&bufptr,
-			0, nullptr
-		);
-		if(bufptr)  msg = reinterpret_cast<char*>(bufptr);
-		LocalFree(bufptr);
-		return  msg;
-	}
-}
-#endif
-
-SharedLibrary& SharedLibrary::open(std::string const &path) {
-	if(_lib)  throw  std::runtime_error("SharedLibrary still open for " + _path);
-	_lib  = load(path);
-	_path = path;
-	return *this;
-}
-
-SharedLibrary::handle_type SharedLibrary::load(std::string const &path) {
-	if(path.empty())  throw  std::domain_error("Empty library path.");
-
-#if defined(_WIN32)
-	SetLastError(0);
-#ifdef UNICODE
-	// Use LoadLibraryA explicitly on windows if UNICODE is defined
-	handle_type const  lib = LoadLibraryA(path.c_str());
-#else
-	handle_type const  lib = LoadLibrary(path.c_str());
-#endif
-	if(!lib)  throw  std::runtime_error(translate_error_message(GetLastError()));
-#else
-	handle_type const  lib = dlopen(path.c_str(), RTLD_LAZY | RTLD_GLOBAL);
-	if(!lib)  throw  std::runtime_error(dlerror());
-#endif
-	return  lib;
-}
-
-void SharedLibrary::unload() {
-	if(_lib) {
-#if defined(_WIN32)
-		FreeLibrary(_lib);
-#else
-		dlclose(_lib);
-#endif
-	}
-}
-
-std::optional<void*> SharedLibrary::getsymbol(char const *const  name) {
-	void *sym;
-#if defined(_WIN32)
-	sym = (void*)GetProcAddress(_lib, name);
-	if(!sym)
-#else
-	dlerror(); // clear error
-	sym = dlsym(_lib, name);
-	char const *const  err = dlerror();
-	if(err)
-#endif
-		return  std::nullopt;
-	return  std::make_optional(sym);
-}
-
-//===========================================================================
-// xsi::Kernel
-
-char const *const  Kernel::Xsi::FUNC_NAMES[EXTENT] = {
-	"xsi_get_value", "xsi_put_value",
-	"xsi_get_int_port", "xsi_get_str_port",
-
-	"xsi_get_int", "xsi_get_port_number",
-
-	"xsi_trace_all", "xsi_run", "xsi_restart",
-	"xsi_get_status", "xsi_get_error_info",
-
-	"xsi_close"
-};
-
-#include <iostream>
-inline Kernel::Xsi::Xsi(SharedLibrary &lib) : _hdl(nullptr) {
-	// Resolve XSI Functions
-	for(unsigned  i = 0; i < EXTENT; i++) {
-		_func[i] = resolve_or_throw(lib, FUNC_NAMES[i]);
-	}
-}
-
-//---------------------------------------------------------------------------
-// Life Cycle
-Kernel::Kernel(std::string const &kernel_lib) : _kernel_lib(kernel_lib), _xsi(_kernel_lib) {}
-
-Kernel::~Kernel() {
-	if(_design_lib)  std::cerr << "Disposing XSI Kernel with open Design." << std::endl;
-}
-
-void Kernel::open(std::string const &design_lib, s_xsi_setup_info const &setup_info) {
-	_design_lib.open(design_lib);
-	try {
-		auto      const  f   = t_fp_xsi_open(resolve_or_throw(_design_lib, "xsi_open"));
-		xsiHandle const  hdl = f(const_cast<p_xsi_setup_info>(&setup_info));
-		if(!hdl)  throw  std::runtime_error("Loading of design failed");
-		_xsi.setHandle(hdl);
-
-		// Enumerate Ports
-		unsigned const          port_count = xsi<Xsi::get_int>(xsiNumTopPorts);
-		std::unique_ptr<Port[]> ports { new Port[port_count] };
-		for(unsigned  i = 0; i < port_count; i++)  new(&ports[i]) Port(*this, i);
-		_port_count = port_count;
-		_ports = std::move(ports);
-	}
-	catch(...) {
-		_design_lib.close();
-		throw;
-	}
-}
-void Kernel::close() noexcept {
-	xsi<Xsi::close>();
-	_xsi.setHandle(nullptr);
-	_design_lib.close();
-	_ports.reset();
-
-	// Clean up Library State
-	std::optional<void*> const  vptr = _kernel_lib.getsymbol("svTypeInfo");
-	if(vptr) *((void**)*vptr) = nullptr;
-}
-
-//===========================================================================
-// xsi::Port
-
-bool Port::hasUnknown() const {
-	unsigned                   const  n = (width()+31) / 32;
-	s_xsi_vlog_logicval const *const  p = buf();
-	for(unsigned  i = 0; i < n; i++) {
-		if(p[i].bVal)  return  true;
-	}
-	return  false;
-}
-
-bool Port::isZero() const {
-	unsigned                   const  n = (width()+31) / 32;
-	s_xsi_vlog_logicval const *const  p = buf();
-	for(unsigned  i = 0; i < n; i++) {
-		if(p[i].aVal)  return  false;
-	}
-	return  true;
-}
-
-std::string Port::as_binstr() const {
-	unsigned const  w = width();
-	std::string  res(w, '?');
-
-	s_xsi_vlog_logicval const *si = buf();
-	std::string::iterator      di = res.end();
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-	uint32_t  a;
-	uint32_t  b;
-	for(unsigned  i = 0; i < w; i++) {
-		if((i & 31) == 0) {
-			a = si->aVal;
-			b = si->bVal;
-			si++;
-		}
-		*--di = XZ10[((b&1)<<1)|(a&1)];
-		a >>= 1;
-		b >>= 1;
-	}
-#pragma GCC diagnostic pop
-	return  res;
-}
-
-std::string Port::as_hexstr() const {
-	unsigned  l = (width()+3)/4;
-	std::string  res(l, '?');
-	s_xsi_vlog_logicval const *si = buf();
-	std::string::iterator      di = res.end();
-
-	while(l > 0) {
-		uint32_t  a = si->aVal;
-		uint32_t  b = si->bVal;
-		si++;
-
-		unsigned  m = std::min(8u, l);
-		l -= m;
-		do {
-			unsigned const  bm = b & 0xF;
-			unsigned const  am = a & 0xF;
-
-			*--di = !bm? HEX[am] : XZ10[3 - !(am&bm)];
-			a >>= 4;
-			b >>= 4;
-		}
-		while(--m > 0);
-	}
-	return  res;
-}
-
-Port& Port::clear() {
-	unsigned             const  n = (width()+31) / 32;
-	s_xsi_vlog_logicval *const  p = buf();
-	std::fill(p, p+n, s_xsi_vlog_logicval { .aVal = 0u, .bVal = 0u });
-	return *this;
-}
-
-Port& Port::set_binstr(std::string const &val) {
-	std::string::const_iterator  si = val.end();
-	s_xsi_vlog_logicval         *di = buf();
-
-	unsigned const  n = (width()+31) / 32;
-	unsigned  l = val.length();
-	for(unsigned  i = 0; i < n; i++) {
-		uint32_t  a = 0;
-		uint32_t  b = 0;
-
-		unsigned const  m = std::min(32u, l);
-		l  -= m;
-		si -= m;
-		for(unsigned  j = 0; j < m; j++) {
-			a <<= 1;
-			b <<= 1;
-			switch(*si++) {
-			case '1':
-				a |= 1;
-			case '0':
-				continue;
-
-			default:
-				a |= 1;
-			case 'Z':
-			case 'z':
-				b |= 1;
-				continue;
-			}
-		}
-		si -= m;
-
-		di->aVal = a;
-		di->bVal = b;
-		di++;
-	}
-
-	return *this;
-}
-
-Port& Port::set_hexstr(std::string const &val) {
-	std::string::const_iterator  si = val.end();
-	s_xsi_vlog_logicval         *di = buf();
-
-	unsigned const  n = (width()+31) / 32;
-	unsigned  l = val.length();
-	for(unsigned  i = 0; i < n; i++) {
-		uint32_t  a = 0;
-		uint32_t  b = 0;
-
-		unsigned const  m = std::min(8u, l);
-		l  -= m;
-		si -= m;
-		for(unsigned  j = 0; j < m; j++) {
-			char  c = *si++;
-			a <<= 4;
-			b <<= 4;
-
-			if(('0' <= c) && c <= '9')  a |= c & 0xF;
-			else {
-				c |= 0x20;
-				if(('a' <= c) && (c <= 'f'))  a |= c - ('a'-10);
-				else {
-					b |= 0xF;
-					if(c != 'z')  a |= 0xF;
-				}
-			}
-		}
-		si -= m;
-
-		di->aVal = a;
-		di->bVal = b;
-		di++;
-	}
-
-	return *this;
-}
diff --git a/finn_xsi/finn_xsi/xsi_finn.hpp b/finn_xsi/finn_xsi/xsi_finn.hpp
deleted file mode 100644
index 4268657aef..0000000000
--- a/finn_xsi/finn_xsi/xsi_finn.hpp
+++ /dev/null
@@ -1,356 +0,0 @@
-/****************************************************************************
- * Copyright (C) 2025, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * @brief	FINN XSI++: C++ XSI Binding used by FINN.
- * @author	Thomas B. Preußer <thomas.preusser@amd.com>
- ***************************************************************************/
-#ifndef XSI_FINN_HPP
-#define XSI_FINN_HPP
-
-#include <memory>
-#include <optional>
-#include <tuple>
-#include <string>
-#include <cstring>
-
-#include <stdexcept>
-#include <exception>
-
-#if defined(_WIN32)
-#	include <windows.h>
-#else
-#	include <dlfcn.h>
-#endif
-
-#include "xsi.h"
-
-
-namespace xsi {
-
-//===========================================================================
-// Shared Library Representation
-
-class SharedLibrary {
-public:
-	static char const  library_suffix[];
-
-private:
-	using  handle_type =
-#if defined(_WIN32)
-		HINSTANCE;
-#else
-		void*;
-#endif
-
-	//-----------------------------------------------------------------------
-	// Instance State
-private:
-	handle_type _lib;
-	std::string _path;
-
-	//-----------------------------------------------------------------------
-	// Life Cycle
-public:
-	SharedLibrary() : _lib(nullptr), _path() {}
-	SharedLibrary(std::string const &path) : _lib(load(path)), _path(path) {}
-	~SharedLibrary() { unload(); }
-
-private:
-	SharedLibrary(SharedLibrary const&) = delete;
-	SharedLibrary& operator=(SharedLibrary const&) = delete;
-
-public:
-	operator bool() const { return  bool(_lib); }
-	SharedLibrary& open(std::string const &path);
-	SharedLibrary& close() {
-		unload();
-		_lib = nullptr;
-		_path.clear();
-		return *this;
-	}
-
-private:
-	static handle_type load(std::string const &path);
-	void unload();
-
-	//-----------------------------------------------------------------------
-	// Accessors
-public:
-	std::string const& path() const { return _path; }
-	std::optional<void*> getsymbol(char const *const  name);
-
-}; // class SharedLibrary
-
-//===========================================================================
-// xsi::Kernel
-
-template<typename It>
-class enumerator {
-	It _begin;
-	It _end;
-public:
-	enumerator(It  begin, It  end) : _begin(begin), _end(end) {}
-	~enumerator() {}
-public:
-	It begin() const { return _begin; }
-	It end()   const { return _end; }
-};
-
-class Design;
-class Port;
-class Kernel {
-
-	//-----------------------------------------------------------------------
-	// Dispatch Table for XSI Functions
-	class Xsi {
-		//- Statics ---------------------
-	public:
-		// Function Indeces
-		static constexpr unsigned
-			get_value = 0, put_value = 1,
-			get_int_port = 2, get_str_port = 3,
-
-			get_int = 4, get_port_number = 5,
-
-			trace_all = 6, run = 7, restart = 8,
-			get_status = 9, get_error_info = 10,
-
-			close = 11;
-
-	private:
-		// Function Names & Types
-		static constexpr unsigned  EXTENT = 12;
-		static char const *const  FUNC_NAMES[EXTENT];
-		using  type_map = std::tuple<
-			// Port Access
-			t_fp_xsi_get_value, t_fp_xsi_put_value,
-			t_fp_xsi_get_int_port, t_fp_xsi_get_str_port,
-
-			// Design Inspection
-			t_fp_xsi_get_int, t_fp_xsi_get_port_number,
-
-			// Simulation Control & Status
-			t_fp_xsi_trace_all, t_fp_xsi_run, t_fp_xsi_restart,
-			t_fp_xsi_get_status, t_fp_xsi_get_error_info,
-
-			// Closing
-			t_fp_xsi_close
-		>;
-
-		//- Actual Contents -------------
-	private:
-		xsiHandle _hdl;
-		void*     _func[EXTENT];
-
-		//- Lifecycle: in-place structure inside Kernel only
-	public:
-		Xsi(SharedLibrary &lib);
-		~Xsi() {}
-	private:
-		Xsi(Xsi const&) = delete;
-		Xsi& operator=(Xsi const&) = delete;
-
-		//- Handle Update ---------------
-	public:
-		void setHandle(xsiHandle  hdl) { _hdl = hdl; }
-
-		//- XSI Function Invocation -----
-	public:
-		template<unsigned  FID, typename... Args>
-		auto invoke(Args&&... args) const {
-			auto const  f = decltype(std::get<FID>(type_map()))(_func[FID]);
-			return  (*f)(_hdl, std::forward<Args>(args)...);
-		}
-
-	}; // class Xsi
-
-private:
-	// Instance State
-	SharedLibrary _kernel_lib;	// Backing Kernel Library
-	Xsi           _xsi;       	// XSI Dispatch Table
-
-	// Optional State once a Design in open
-	SharedLibrary           _design_lib;
-	unsigned                _port_count;
-	std::unique_ptr<Port[]> _ports;
-
-public:
-	Kernel(std::string const &kernel_lib);
-	Kernel(Kernel const&) = delete;
-	Kernel& operator=(Kernel const&) = delete;
-	~Kernel();
-
-	// Interface reserved for forwarded access through open Design
-private:
-	friend Design;
-	friend Port;
-	template<unsigned  FID, typename... Args>
-	auto xsi(Args&&... args) const {
-		return _xsi.invoke<FID>(std::forward<Args>(args)...);
-	}
-
-	// Port Accessors inlined below and public through Design
-	Port*       getPort(char const *const  name);
-	Port const* getPort(char const *const  name) const;
-	enumerator<Port*>       ports();
-	enumerator<Port const*> ports() const;
-
-	// Design con- & destruction hooks
-	void open(std::string const &design_lib, s_xsi_setup_info const &setup_info);
-	void close() noexcept;
-
-public:
-	// Hex printing manipulation
-	static void hex_in_lower();
-	static void hex_in_upper();
-
-}; // class Kernel
-
-//===========================================================================
-// xsi::Design
-
-//	- non-copyable, non-movable handle for exposing simulation control.
-class Design {
-	using  Xsi = Kernel::Xsi;
-	Kernel &_kernel;
-
-public:
-	Design(
-		Kernel &kernel,
-		std::string const &design_lib,
-		s_xsi_setup_info const &setup_info
-	) : _kernel(kernel) { kernel.open(design_lib, setup_info); }
-	Design(
-		Kernel &kernel, std::string const &design_lib,
-		char const *const  log_file = nullptr,
-		char const *const  wdb_file = nullptr
-	) : Design(kernel, design_lib, s_xsi_setup_info {
-		.logFileName = const_cast<char*>(log_file),
-		.wdbFileName = const_cast<char*>(wdb_file)
-	}) {}
-	~Design() { _kernel.close(); }
-
-private:
-	Design(Design const&) = delete;
-	Design& operator*(Design const&) = delete;
-
-	//-----------------------------------------------------------------------
-	// Forwarded Access to Open Simulation
-
-	// Simulation Control & Status
-public:
-	void trace_all()                { _kernel.xsi<Xsi::trace_all>(); }
-	void run(XSI_INT64 const  step) { _kernel.xsi<Xsi::run>(step); }
-	void restart()                  { _kernel.xsi<Xsi::restart>(); }
-
-	int         get_status()     const { return _kernel.xsi<Xsi::get_status>(); }
-	char const* get_error_info() const { return _kernel.xsi<Xsi::get_error_info>(); }
-
-	// Port Access
-public:
-	int num_ports() const { return _kernel._port_count; }
-
-	Port*       getPort(std::string const &name)       { return _kernel.getPort(name.c_str()); }
-	Port const* getPort(std::string const &name) const { return _kernel.getPort(name.c_str()); }
-
-	enumerator<Port*>       ports()       { return _kernel.ports(); }
-	enumerator<Port const*> ports() const { return const_cast<Kernel const&>(_kernel).ports(); }
-
-}; // class Design
-
-//===========================================================================
-// xsi::Port
-
-// Only exists within controlled environment within Kernel with open Design.
-class Port {
-	using  Xsi = Kernel::Xsi;
-	Kernel         &_kernel;
-	unsigned const  _id;
-	std::unique_ptr<s_xsi_vlog_logicval[]> const _buf;
-
-private:
-	// Con- and destruction under full control of Kernel
-	friend class Kernel;
-	Port() : _kernel(*static_cast<Kernel*>(nullptr)), _id(0), _buf() {}
-	Port(Kernel &kernel, unsigned const  id)
-		: _kernel(kernel), _id(id),
-		_buf(std::make_unique<s_xsi_vlog_logicval[]>((width()+31)/32)) {}
-	Port(Port const&) = delete;
-	Port& operator=(Port const&) = delete;
-public:
-	~Port() {}
-
-public:
-	char const* name()  const { return _kernel.xsi<Xsi::get_str_port>(_id, xsiNameTopPort); }
-	int         dir()   const { return _kernel.xsi<Xsi::get_int_port>(_id, xsiDirectionTopPort); }
-	unsigned    width() const { return _kernel.xsi<Xsi::get_int_port>(_id, xsiHDLValueSize); }
-
-	bool isInput()  const { return  dir() == xsiInputPort; }
-	bool isOutput() const { return  dir() == xsiOutputPort; }
-	bool isInout()  const { return  dir() == xsiInoutPort; }
-
-private:
-	s_xsi_vlog_logicval*       buf()       { return _buf.get(); }
-	s_xsi_vlog_logicval const* buf() const { return _buf.get(); }
-
-public:
-	// Buffer Synchronization
-	Port& read() {
-		_kernel.xsi<Xsi::get_value>(_id, buf());
-		return *this;
-	}
-	void write_back() {
-		_kernel.xsi<Xsi::put_value>(_id, buf());
-	}
-
-	// Inspection
-	bool hasUnknown() const;
-	bool isZero() const;
-	bool operator[](unsigned const  idx) const {
-		return (buf()[idx/32].aVal >> (idx%32)) & 1;
-	}
-
-	bool     as_bool()     const { return  buf()->aVal & 1; }
-	unsigned as_unsigned() const { return  buf()->aVal; }
-	std::string as_binstr() const;
-	std::string as_hexstr() const;
-
-	// Manipulation
-	Port& clear();
-	Port& set(unsigned  val) {
-		s_xsi_vlog_logicval *const  p = buf();
-		p->aVal = val;
-		p->bVal =   0;
-		return *this;
-	}
-	Port& set_binstr(std::string const &val);
-	Port& set_hexstr(std::string const &val);
-
-}; // class Port
-
-// Inlined Kernel Port Accessors
-
-inline Port* Kernel::getPort(char const *const  name) {
-	int const  id = xsi<Xsi::get_port_number>(name);
-	return  (id == -1)? nullptr : &_ports[id];
-}
-inline Port const* Kernel::getPort(char const *const  name) const {
-	int const  id = xsi<Xsi::get_port_number>(name);
-	return  (id == -1)? nullptr : &_ports[id];
-}
-
-inline enumerator<Port*> Kernel::ports() {
-	Port *const  beg = _ports.get();
-	return { beg, beg + _port_count };
-}
-inline enumerator<Port const*> Kernel::ports() const {
-	Port const *const  beg = _ports.get();
-	return { beg, beg + _port_count };
-}
-
-} // namespace xsi
-
-#endif
diff --git a/finn_xsi/testcase/StreamingEltwise_hls_0.v b/finn_xsi/testcase/StreamingEltwise_hls_0.v
deleted file mode 100644
index f5207e0548..0000000000
--- a/finn_xsi/testcase/StreamingEltwise_hls_0.v
+++ /dev/null
@@ -1,349 +0,0 @@
-// ==============================================================
-// Generated by Vitis HLS v2024.2
-// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
-// Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
-// ==============================================================
-
-`timescale 1 ns / 1 ps
-
-(* CORE_GENERATION_INFO="StreamingEltwise_hls_0_StreamingEltwise_hls_0,hls_ip_2024_2,{HLS_INPUT_TYPE=cxx,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xc7z020-clg400-1,HLS_INPUT_CLOCK=5.000000,HLS_INPUT_ARCH=others,HLS_SYN_CLOCK=4.826000,HLS_SYN_LAT=10,HLS_SYN_TPT=none,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=7,HLS_SYN_LUT=101,HLS_VERSION=2024_2}" *)
-
-module StreamingEltwise_hls_0 (
-        ap_clk,
-        ap_rst_n,
-        in0_V_TVALID,
-        in1_V_TVALID,
-        out_V_TREADY,
-        in0_V_TDATA,
-        in0_V_TREADY,
-        in1_V_TDATA,
-        in1_V_TREADY,
-        out_V_TDATA,
-        out_V_TVALID
-);
-
-parameter    ap_ST_iter0_fsm_state1 = 1'd1;
-parameter    ap_ST_iter1_fsm_state2 = 2'd2;
-parameter    ap_ST_iter1_fsm_state0 = 2'd1;
-
-input   ap_clk;
-input   ap_rst_n;
-input   in0_V_TVALID;
-input   in1_V_TVALID;
-input   out_V_TREADY;
-input  [7:0] in0_V_TDATA;
-output   in0_V_TREADY;
-input  [7:0] in1_V_TDATA;
-output   in1_V_TREADY;
-output  [15:0] out_V_TDATA;
-output   out_V_TVALID;
-
- reg    ap_rst_n_inv;
-reg   [0:0] ap_CS_iter0_fsm;
-wire    ap_CS_iter0_fsm_state1;
-reg    ap_block_state1_pp0_stage0_iter0;
-reg   [1:0] ap_CS_iter1_fsm;
-wire    regslice_both_out_V_U_apdone_blk;
-reg    ap_block_state2_pp0_stage0_iter1;
-wire    ap_CS_iter1_fsm_state2;
-wire   [0:0] icmp_ln82_fu_110_p2;
-reg    ap_condition_exit_pp0_iter0_stage0;
-reg    ap_ready_int;
-reg    in0_V_TDATA_blk_n;
-reg    in1_V_TDATA_blk_n;
-reg    out_V_TDATA_blk_n;
-reg   [0:0] icmp_ln82_reg_133;
-wire   [0:0] icmp_ln82_reg_133_pp0_iter0_reg;
-reg   [2:0] i1_fu_50;
-wire   [2:0] i_fu_104_p2;
-wire    ap_loop_init;
-reg   [2:0] ap_sig_allocacmp_i1_load;
-wire   [3:0] in0_slice_channels_fu_81_p1;
-wire   [8:0] zext_ln20_fu_85_p1;
-wire   [8:0] zext_ln20_1_fu_89_p1;
-wire   [8:0] outElem_fu_93_p2;
-reg   [0:0] ap_NS_iter0_fsm;
-reg   [1:0] ap_NS_iter1_fsm;
-reg    ap_ST_iter0_fsm_state1_blk;
-reg    ap_ST_iter1_fsm_state2_blk;
-wire    ap_start_int;
-wire    ap_ready_sig;
-wire    ap_done_sig;
-wire    ap_continue_int;
-wire    regslice_both_in0_V_U_apdone_blk;
-wire   [7:0] in0_V_TDATA_int_regslice;
-wire    in0_V_TVALID_int_regslice;
-reg    in0_V_TREADY_int_regslice;
-wire    regslice_both_in0_V_U_ack_in;
-wire    regslice_both_in1_V_U_apdone_blk;
-wire   [7:0] in1_V_TDATA_int_regslice;
-wire    in1_V_TVALID_int_regslice;
-reg    in1_V_TREADY_int_regslice;
-wire    regslice_both_in1_V_U_ack_in;
-wire   [15:0] out_V_TDATA_int_regslice;
-reg    out_V_TVALID_int_regslice;
-wire    out_V_TREADY_int_regslice;
-wire    regslice_both_out_V_U_vld_out;
-reg    ap_condition_50;
-wire    ap_ce_reg;
-
-// power-on initialization
-initial begin
-#0 ap_CS_iter0_fsm = 1'd1;
-#0 ap_CS_iter1_fsm = 2'd1;
-#0 i1_fu_50 = 3'd0;
-end
-
-StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont flow_control_loop_pipe_no_ap_cont_U(
-    .ap_clk(ap_clk),
-    .ap_rst(ap_rst_n_inv),
-    .ap_start(1'b1),
-    .ap_ready(ap_ready_sig),
-    .ap_done(ap_done_sig),
-    .ap_start_int(ap_start_int),
-    .ap_loop_init(ap_loop_init),
-    .ap_ready_int(ap_ready_int),
-    .ap_loop_exit_ready(ap_condition_exit_pp0_iter0_stage0),
-    .ap_loop_exit_done(1'b0),
-    .ap_continue_int(ap_continue_int),
-    .ap_done_int(1'b0)
-);
-
-StreamingEltwise_hls_0_regslice_both #(
-    .DataWidth( 8 ))
-regslice_both_in0_V_U(
-    .ap_clk(ap_clk),
-    .ap_rst(ap_rst_n_inv),
-    .data_in(in0_V_TDATA),
-    .vld_in(in0_V_TVALID),
-    .ack_in(regslice_both_in0_V_U_ack_in),
-    .data_out(in0_V_TDATA_int_regslice),
-    .vld_out(in0_V_TVALID_int_regslice),
-    .ack_out(in0_V_TREADY_int_regslice),
-    .apdone_blk(regslice_both_in0_V_U_apdone_blk)
-);
-
-StreamingEltwise_hls_0_regslice_both #(
-    .DataWidth( 8 ))
-regslice_both_in1_V_U(
-    .ap_clk(ap_clk),
-    .ap_rst(ap_rst_n_inv),
-    .data_in(in1_V_TDATA),
-    .vld_in(in1_V_TVALID),
-    .ack_in(regslice_both_in1_V_U_ack_in),
-    .data_out(in1_V_TDATA_int_regslice),
-    .vld_out(in1_V_TVALID_int_regslice),
-    .ack_out(in1_V_TREADY_int_regslice),
-    .apdone_blk(regslice_both_in1_V_U_apdone_blk)
-);
-
-StreamingEltwise_hls_0_regslice_both #(
-    .DataWidth( 16 ))
-regslice_both_out_V_U(
-    .ap_clk(ap_clk),
-    .ap_rst(ap_rst_n_inv),
-    .data_in(out_V_TDATA_int_regslice),
-    .vld_in(out_V_TVALID_int_regslice),
-    .ack_in(out_V_TREADY_int_regslice),
-    .data_out(out_V_TDATA),
-    .vld_out(regslice_both_out_V_U_vld_out),
-    .ack_out(out_V_TREADY),
-    .apdone_blk(regslice_both_out_V_U_apdone_blk)
-);
-
-always @ (posedge ap_clk) begin
-    if (ap_rst_n_inv == 1'b1) begin
-        ap_CS_iter0_fsm <= ap_ST_iter0_fsm_state1;
-    end else begin
-        ap_CS_iter0_fsm <= ap_NS_iter0_fsm;
-    end
-end
-
-always @ (posedge ap_clk) begin
-    if (ap_rst_n_inv == 1'b1) begin
-        ap_CS_iter1_fsm <= ap_ST_iter1_fsm_state0;
-    end else begin
-        ap_CS_iter1_fsm <= ap_NS_iter1_fsm;
-    end
-end
-
-always @ (posedge ap_clk) begin
-    if ((1'b1 == ap_condition_50)) begin
-        i1_fu_50 <= i_fu_104_p2;
-    end
-end
-
-always @ (posedge ap_clk) begin
-    if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin
-        icmp_ln82_reg_133 <= icmp_ln82_fu_110_p2;
-    end
-end
-
-always @ (*) begin
-    if ((1'b1 == ap_block_state1_pp0_stage0_iter0)) begin
-        ap_ST_iter0_fsm_state1_blk = 1'b1;
-    end else begin
-        ap_ST_iter0_fsm_state1_blk = 1'b0;
-    end
-end
-
-always @ (*) begin
-    if ((1'b1 == ap_block_state2_pp0_stage0_iter1)) begin
-        ap_ST_iter1_fsm_state2_blk = 1'b1;
-    end else begin
-        ap_ST_iter1_fsm_state2_blk = 1'b0;
-    end
-end
-
-always @ (*) begin
-    if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (icmp_ln82_fu_110_p2 == 1'd1) & (1'b1 == ap_CS_iter0_fsm_state1))) begin
-        ap_condition_exit_pp0_iter0_stage0 = 1'b1;
-    end else begin
-        ap_condition_exit_pp0_iter0_stage0 = 1'b0;
-    end
-end
-
-always @ (*) begin
-    if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin
-        ap_ready_int = 1'b1;
-    end else begin
-        ap_ready_int = 1'b0;
-    end
-end
-
-always @ (*) begin
-    if (((ap_loop_init == 1'b1) & (1'b1 == ap_CS_iter0_fsm_state1))) begin
-        ap_sig_allocacmp_i1_load = 3'd0;
-    end else begin
-        ap_sig_allocacmp_i1_load = i1_fu_50;
-    end
-end
-
-always @ (*) begin
-    if ((1'b1 == ap_CS_iter0_fsm_state1)) begin
-        in0_V_TDATA_blk_n = in0_V_TVALID_int_regslice;
-    end else begin
-        in0_V_TDATA_blk_n = 1'b1;
-    end
-end
-
-always @ (*) begin
-    if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin
-        in0_V_TREADY_int_regslice = 1'b1;
-    end else begin
-        in0_V_TREADY_int_regslice = 1'b0;
-    end
-end
-
-always @ (*) begin
-    if ((1'b1 == ap_CS_iter0_fsm_state1)) begin
-        in1_V_TDATA_blk_n = in1_V_TVALID_int_regslice;
-    end else begin
-        in1_V_TDATA_blk_n = 1'b1;
-    end
-end
-
-always @ (*) begin
-    if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin
-        in1_V_TREADY_int_regslice = 1'b1;
-    end else begin
-        in1_V_TREADY_int_regslice = 1'b0;
-    end
-end
-
-always @ (*) begin
-    if (((1'b1 == ap_CS_iter1_fsm_state2) | (1'b1 == ap_CS_iter0_fsm_state1))) begin
-        out_V_TDATA_blk_n = out_V_TREADY_int_regslice;
-    end else begin
-        out_V_TDATA_blk_n = 1'b1;
-    end
-end
-
-always @ (*) begin
-    if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin
-        out_V_TVALID_int_regslice = 1'b1;
-    end else begin
-        out_V_TVALID_int_regslice = 1'b0;
-    end
-end
-
-always @ (*) begin
-    case (ap_CS_iter0_fsm)
-        ap_ST_iter0_fsm_state1 : begin
-            ap_NS_iter0_fsm = ap_ST_iter0_fsm_state1;
-        end
-        default : begin
-            ap_NS_iter0_fsm = 'bx;
-        end
-    endcase
-end
-
-always @ (*) begin
-    case (ap_CS_iter1_fsm)
-        ap_ST_iter1_fsm_state2 : begin
-            if (((1'b1 == ap_CS_iter0_fsm_state1) & (1'b0 == ap_block_state2_pp0_stage0_iter1) & (1'b0 == ap_block_state1_pp0_stage0_iter0))) begin
-                ap_NS_iter1_fsm = ap_ST_iter1_fsm_state2;
-            end else if (((1'b0 == ap_block_state2_pp0_stage0_iter1) & ((1'b0 == ap_CS_iter0_fsm_state1) | ((1'b1 == ap_CS_iter0_fsm_state1) & (1'b1 == ap_block_state1_pp0_stage0_iter0))))) begin
-                ap_NS_iter1_fsm = ap_ST_iter1_fsm_state0;
-            end else if (((icmp_ln82_reg_133_pp0_iter0_reg == 1'd1) & (1'b1 == ap_CS_iter1_fsm_state2) & (1'b0 == ap_block_state2_pp0_stage0_iter1))) begin
-                ap_NS_iter1_fsm = ap_ST_iter0_fsm_state1;
-            end else begin
-                ap_NS_iter1_fsm = ap_ST_iter1_fsm_state2;
-            end
-        end
-        ap_ST_iter1_fsm_state0 : begin
-            if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin
-                ap_NS_iter1_fsm = ap_ST_iter1_fsm_state2;
-            end else begin
-                ap_NS_iter1_fsm = ap_ST_iter1_fsm_state0;
-            end
-        end
-        default : begin
-            ap_NS_iter1_fsm = 'bx;
-        end
-    endcase
-end
-
-assign ap_CS_iter0_fsm_state1 = ap_CS_iter0_fsm[32'd0];
-
-assign ap_CS_iter1_fsm_state2 = ap_CS_iter1_fsm[32'd1];
-
-always @ (*) begin
-    ap_block_state1_pp0_stage0_iter0 = ((out_V_TREADY_int_regslice == 1'b0) | (in1_V_TVALID_int_regslice == 1'b0) | (in0_V_TVALID_int_regslice == 1'b0));
-end
-
-always @ (*) begin
-    ap_block_state2_pp0_stage0_iter1 = ((regslice_both_out_V_U_apdone_blk == 1'b1) | (out_V_TREADY_int_regslice == 1'b0));
-end
-
-always @ (*) begin
-    ap_condition_50 = (~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1));
-end
-
-always @ (*) begin
-    ap_rst_n_inv = ~ap_rst_n;
-end
-
-assign i_fu_104_p2 = (ap_sig_allocacmp_i1_load + 3'd1);
-
-assign icmp_ln82_fu_110_p2 = ((ap_sig_allocacmp_i1_load == 3'd7) ? 1'b1 : 1'b0);
-
-assign icmp_ln82_reg_133_pp0_iter0_reg = icmp_ln82_reg_133;
-
-assign in0_V_TREADY = regslice_both_in0_V_U_ack_in;
-
-assign in0_slice_channels_fu_81_p1 = in0_V_TDATA_int_regslice[3:0];
-
-assign in1_V_TREADY = regslice_both_in1_V_U_ack_in;
-
-assign outElem_fu_93_p2 = (zext_ln20_fu_85_p1 - zext_ln20_1_fu_89_p1);
-
-assign out_V_TDATA_int_regslice = outElem_fu_93_p2;
-
-assign out_V_TVALID = regslice_both_out_V_U_vld_out;
-
-assign zext_ln20_1_fu_89_p1 = in1_V_TDATA_int_regslice;
-
-assign zext_ln20_fu_85_p1 = in0_slice_channels_fu_81_p1;
-
-endmodule //StreamingEltwise_hls_0
diff --git a/finn_xsi/testcase/StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont.v b/finn_xsi/testcase/StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont.v
deleted file mode 100644
index e3ff4d1e48..0000000000
--- a/finn_xsi/testcase/StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont.v
+++ /dev/null
@@ -1,103 +0,0 @@
-// ==============================================================
-// Vitis HLS - High-Level Synthesis from C, C++ and OpenCL v2024.2 (64-bit)
-// Tool Version Limit: 2024.11
-// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
-// Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
-//
-// ==============================================================
-
-`timescale 1 ns / 1 ps
-
-module StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont(
-        ap_clk,
-        ap_rst,
-        ap_start,
-        ap_ready,
-        ap_done,
-        ap_start_int,
-        ap_ready_int,
-        ap_done_int,
-        ap_continue_int,
-        ap_loop_init,
-        ap_loop_exit_ready,
-        ap_loop_exit_done
-);
-
-input   ap_clk;
-input   ap_rst;
-
-//Block level handshake with outside loop
-input   ap_start;
-output  ap_ready;
-output  ap_done;
-
-//Block level handshake with loop body
-output  ap_start_int;
-input   ap_ready_int;
-input   ap_done_int;
-output  ap_continue_int;
-
-//Init live in variables
-output   ap_loop_init;
-reg ap_loop_init;
-reg ap_done;
-reg ap_done_cache;
-
-//Exit signal from loop body
-input   ap_loop_exit_ready;
-input   ap_loop_exit_done;
-
-// power-on initialization
-initial begin
-#0 ap_loop_init = 1'b1;
-#0 ap_done_cache = 1'b0;
-end
-
-assign ap_start_int = ap_start;
-
-assign ap_continue_int = 1'b1;
-
-assign ap_ready = ap_loop_exit_ready;
-
-//ap_loop_init is valid for the first II
-//of the first loop run so as to enable
-//the init block ops which are pushed into
-//the first state of the pipeline region
-always @ (posedge ap_clk)
-begin
-    if (ap_rst == 1'b1) begin
-        ap_loop_init <= 1'b1;
-    end else if(ap_loop_exit_ready == 1'b1) begin
-        ap_loop_init <= 1'b1;
-    end else if(ap_ready_int == 1'b1) begin
-        ap_loop_init <= 1'b0;
-    end
-end
-
-// if no ap_continue port and current module is not top module,
-// ap_done handshakes with ap_start. Internally, flow control sends out
-// ap_conintue_int = 1'b1 so the ap_done_int is asserted high for 1 clock cycle.
-// ap_done_cache is used to record ap_done_int, and de-assert if ap_start_int
-// is asserted, so DUT can start the next run
-always @(posedge ap_clk)
-begin
-    if (ap_rst == 1'b1) begin
-        ap_done_cache <= 1'b0;
-    end else if (ap_done_int == 1'b1) begin
-        ap_done_cache <= 1'b1;
-    end else if (ap_start_int == 1'b1) begin
-        ap_done_cache <= 1'b0;
-    end
-end
-
-// if no ap_continue port and current module is not top module, ap_done handshakes with ap_start
-always @(*)
-begin
-    if ((ap_done_int == 1'b1) || ((ap_done_cache == 1'b1) && (ap_start_int == 1'b0))) begin
-        ap_done = 1'b1;
-    end else begin
-        ap_done = 1'b0;
-    end
-end
-
-endmodule
diff --git a/finn_xsi/testcase/StreamingEltwise_hls_0_regslice_both.v b/finn_xsi/testcase/StreamingEltwise_hls_0_regslice_both.v
deleted file mode 100644
index c2e16007cc..0000000000
--- a/finn_xsi/testcase/StreamingEltwise_hls_0_regslice_both.v
+++ /dev/null
@@ -1,110 +0,0 @@
-// ==============================================================
-// Generated by Vitis HLS v2024.2
-// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
-// Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
-// ==============================================================
-`timescale 1ns/1ps
-
-module StreamingEltwise_hls_0_regslice_both
-#(parameter
-    DataWidth = 8
-) (
-    // system signals
-    input  wire                  ap_clk,
-    input  wire                  ap_rst,
-    // slave side
-    input  wire [DataWidth-1:0]  data_in,
-    input  wire                  vld_in,
-    output wire                  ack_in,
-    // master side
-    output wire [DataWidth-1:0]  data_out,
-    output wire                  vld_out,
-    input  wire                  ack_out,
-    output wire                  apdone_blk);
-    //------------------------Parameter----------------------
-    // state
-    localparam [1:0]
-        ZERO = 2'b10,
-        ONE  = 2'b11,
-        TWO  = 2'b01;
-    //------------------------Local signal-------------------
-    reg  [DataWidth-1:0] data_p1 = {DataWidth{1'b0}};
-    reg  [DataWidth-1:0] data_p2 = {DataWidth{1'b0}};
-    wire         load_p1;
-    wire         load_p2;
-    wire         load_p1_from_p2;
-    reg          ack_in_t = 1'b0;
-    reg  [1:0]   state = 2'b00;
-    reg  [1:0]   next;
-    //------------------------Body---------------------------
-    assign ack_in = ack_in_t;
-    assign data_out = data_p1;
-    assign vld_out = state[0];
-    assign apdone_blk = (state == ONE && ~ack_out) || (state == TWO);
-
-    assign load_p1 = (state == ZERO && vld_in) ||
-                    (state == ONE && vld_in && ack_out) ||
-                    (state == TWO && ack_out);
-    assign load_p2 = vld_in & ack_in;
-    assign load_p1_from_p2 = (state == TWO);
-
-    // data_p1
-    always @(posedge ap_clk) begin
-        if (load_p1) begin
-            if (load_p1_from_p2)
-                data_p1 <= data_p2;
-            else
-                data_p1 <= data_in;
-        end
-    end
-
-    // data_p2
-    always @(posedge ap_clk) begin
-        if (load_p2) data_p2 <= data_in;
-    end
-
-    // ack_in_t
-    always @(posedge ap_clk) begin
-        if (ap_rst)
-            ack_in_t <= 1'b0;
-        else if (state == ZERO)
-            ack_in_t <= 1'b1;
-        else if (state == ONE && next == TWO)
-            ack_in_t <= 1'b0;
-        else if (state == TWO && next == ONE)
-            ack_in_t <= 1'b1;
-    end
-
-    // state
-    always @(posedge ap_clk) begin
-        if (ap_rst)
-            state <= ZERO;
-        else
-            state <= next;
-    end
-
-    // next
-    always @(*) begin
-        case (state)
-            ZERO:
-                if (vld_in & ack_in)
-                    next = ONE;
-                else
-                    next = ZERO;
-            ONE:
-                if (~vld_in & ack_out)
-                    next = ZERO;
-                else if (vld_in & ~ack_out)
-                    next = TWO;
-                else
-                    next = ONE;
-            TWO:
-                if (ack_out)
-                    next = ONE;
-                else
-                    next = TWO;
-            default:
-                next = ZERO;
-        endcase
-    end
-endmodule
diff --git a/pyproject.toml b/pyproject.toml
index 6577b95938..4f03a52311 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -183,5 +183,6 @@ ignore = [
     "ANN401",   # Don't use the "Any" type
     "D413",     # Blank lines at docstring end
     "D205",     # Blank line after summary (enables multiline summaries)
-    "N801",     # Class names should use CapWords convention
+    "N801",     # Class name should use CapWords convention
+    "D209",     # Multi-line docstring closing quotes should be on a separate line
 ]
diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py
index 3e596add9d..499fa721cb 100644
--- a/src/finn/benchmarking/bench.py
+++ b/src/finn/benchmarking/bench.py
@@ -102,9 +102,12 @@ def get_default_session_options_new():
             is_followup = True
             save_dir = save_dir + "_followup"
         else:
-            config_path = os.path.join("ci", "cfg", config_name + ".yml")
+            if config_name.endswith(".yaml") or config_name.endswith(".yml"):
+                config_path = config_name
+            else:
+                config_path = os.path.join("ci", "cfg", config_name + ".yml")
         print("Job launched with SLURM ID: %d" % (job_id))
-    except KeyError:
+    except KeyError as e:
         # Launched without SLURM, assume test run on local machine
         job_id = 0
         experiment_dir = "bench_output/" + time.strftime("%d_%H_%M")
diff --git a/src/finn/benchmarking/dut/resnet18.yml b/src/finn/benchmarking/dut/resnet18.yml
index f427c33e83..fb8a6589fe 100644
--- a/src/finn/benchmarking/dut/resnet18.yml
+++ b/src/finn/benchmarking/dut/resnet18.yml
@@ -1,3 +1,7 @@
+model_path: models/resnet18/resnet18_w3a3_cifar100.onnx
+folding_config_file: models/resnet18/resnet18_folding_config.json
+specialize_layers_config_file: models/resnet18/resnet18_specialize_layers.json
+
 steps:
   - step_qonnx_to_finn
   - step_tidy_up
@@ -11,13 +15,13 @@ steps:
   - step_apply_folding_config
   - step_minimize_bit_width
   - step_generate_estimate_reports
-  - step_build_simulation
-  - step_size_fifo_connected
-  - step_apply_fifosizes
-  - step_generate_estimate_reports
+  - step_set_fifo_depths
   - step_hw_codegen
   - step_hw_ipgen
   - step_create_stitched_ip
   - step_synthesize_bitfile
   - step_make_driver
   - step_deployment_package
+
+# Required to use RTL MVAUs
+standalone_thresholds: true
diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py
index 3f4caa26ea..2114706642 100644
--- a/src/finn/builder/build_dataflow.py
+++ b/src/finn/builder/build_dataflow.py
@@ -52,6 +52,13 @@
 from typing import Any, TextIO
 
 import finn.util.logging
+from finn.util.exception import (
+    FINNConfigurationError,
+    FINNDataflowError,
+    FINNError,
+    FINNUserError,
+)
+from finn.util.exception_snapshot import snapshot_on_exception
 from finn.builder.build_dataflow_config import (
     DataflowBuildConfig,
     LogLevel,
@@ -60,8 +67,6 @@
 )
 from finn.builder.build_dataflow_steps import build_dataflow_step_lookup
 from finn.util.basic import get_vivado_root
-from finn.util.exception import FINNConfigurationError, FINNDataflowError, FINNError, FINNUserError
-from finn.util.exception_snapshot import snapshot_on_exception
 from finn.util.logging import log
 from finn.util.settings import get_settings
 
@@ -345,7 +350,7 @@ def create_model_wrapper(model_filename: str, cfg: DataflowBuildConfig) -> Model
         f"Building dataflow accelerator from intermediate"
         f" checkpoint {intermediate_model_filename}"
     )
-    return ModelWrapper(intermediate_model_filename)
+    return ModelWrapper(str(intermediate_model_filename))
 
 
 def build_dataflow_cfg(model_filename: str, cfg: DataflowBuildConfig) -> int:
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 32f28add90..75d4aa3468 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -97,6 +97,7 @@ class AutoFIFOSizingMethod(str, Enum):
 
     CHARACTERIZE = "characterize"
     LARGEFIFO_RTLSIM = "largefifo_rtlsim"
+    DISTRIBUTED_SIMULATION = "distributed_sim"
 
 
 class ShellFlowType(str, Enum):
@@ -455,6 +456,10 @@ def _fix_path(p: Path | None) -> Path | None:
     #: Enables experimental live FIFO sizing on the FPGA.
     live_fifo_sizing: bool = False
 
+    #: Whether to use functional simulation when available. Takes some time
+    #: to synthesize, but results in much faster simulations.
+    functional_simulation: bool = True
+
     #: Whether FIFO nodes with depth larger than 32768 will be split.
     #: Allow to configure very large FIFOs in the folding_config_file.
     split_large_fifos: bool = False
@@ -638,6 +643,10 @@ def _resolve_fpga_part(self) -> str:
         """
         if self.fpga_part is None:
             # lookup from part map if not specified
+            if self.board is None:
+                raise FINNConfigurationError(
+                    "Either board or fpga_part must be specified in flow config."
+                )
             try:
                 fpga_part = part_map[self.board]
                 return fpga_part
@@ -715,11 +724,11 @@ def _resolve_verification_io_pair(self) -> None | tuple[Any, Any]:
         if self.verify_steps is None:
             return None
         if not Path(self.verify_input_npy).is_file():
-            raise FINNConfigurationError("verify_input_npy not found: " + self.verify_input_npy)
+            raise FINNConfigurationError("verify_input_npy not found: " + str(self.verify_input_npy))
         verify_input_npy = np.load(self.verify_input_npy)
         if not Path(self.verify_expected_output_npy).is_file():
             raise FINNConfigurationError(
-                "verify_expected_output_npy not found: " + self.verify_expected_output_npy
+                "verify_expected_output_npy not found: " + str(self.verify_expected_output_npy)
             )
         verify_expected_output_npy = np.load(self.verify_expected_output_npy)
         return (
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index a0d969f8c5..f9fd44d50b 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -69,6 +69,7 @@
     DataflowOutputType,
     ShellFlowType,
     VerificationStepType,
+    AutoFIFOSizingMethod
 )
 from finn.builder.passes import step_passes_frontend
 from finn.core.onnx_exec import execute_onnx
@@ -683,6 +684,61 @@ def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
     return model
 
 
+# TODO: Both this and the step_size_... steps will be reworked before merging into dev
+# TODO: These are also included in step_set_fifo_depths if the correct FIFO sizing method
+# was selected
+def step_build_simulation(model: ModelWrapper, cfg: DataflowBuildConfig) -> ModelWrapper:
+    """Build the simulation binaries for isolated and connected simulations."""
+    from finn.transformation.fpgadataflow.simulation_build import BuildSimulation
+
+    model = model.transform(
+        BuildSimulation(
+            cfg._resolve_fpga_part(),  # noqa
+            cfg._resolve_hls_clk_period(),  # noqa
+            cfg.functional_simulation,
+        )
+    )
+    return model
+
+
+def step_size_fifo_isolated(model: ModelWrapper, cfg: DataflowBuildConfig) -> ModelWrapper:
+    """Simulate layers in isolation and use the observed behaviour to size the FIFOs accordingly."""
+    from pathlib import Path
+
+    from finn.transformation.fpgadataflow.simulation_isolated import RunLayerIsolatedSimulation
+
+    model = model.transform(
+        RunLayerIsolatedSimulation(
+            cfg._resolve_fpga_part(),  # noqa
+            cfg._resolve_hls_clk_period(),  # noqa
+            cfg.functional_simulation,
+            Path(cfg.output_dir),
+        )
+    )
+    return model
+
+
+def step_size_fifo_connected(model: ModelWrapper, cfg: DataflowBuildConfig) -> ModelWrapper:
+    """Simulate layers connected and use the observed behaviour to size the FIFOs accordingly."""
+    from finn.transformation.fpgadataflow.simulation_connected import RunLayerParallelSimulation
+
+    model = model.transform(
+        RunLayerParallelSimulation(
+            cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period(), cfg  # noqa  # noqa
+        )
+    )
+    return model
+
+
+def step_apply_fifosizes(model: ModelWrapper, cfg: DataflowBuildConfig) -> ModelWrapper:
+    """Apply the previously found FIFO sizes to the model."""
+    from finn.transformation.fpgadataflow.simulation import ApplyFIFOSizes
+
+    model = model.transform(ApplyFIFOSizes(cfg))
+    model = model.transform(SplitLargeFIFOs(max_qsrl_depth=256))
+    return model
+
+
 def step_insert_dwc(model: ModelWrapper, cfg: DataflowBuildConfig):
     """Inserts data width converters between layers where necessary."""
     model = model.transform(InsertDWC())
@@ -828,6 +884,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
             )
             # InsertAndSetFIFODepths internally removes any shallow FIFOs
             # so no need to call RemoveShallowFIFOs here
+        elif cfg.auto_fifo_strategy == AutoFIFOSizingMethod.DISTRIBUTED_SIMULATION:
+            # TODO: When merging into dev, this should be finalized
+            model = step_build_simulation(model, cfg)
+            model = step_size_fifo_connected(model, cfg)
+            model = step_apply_fifosizes(model, cfg)
+            return model
         else:
             assert "Unsupported auto_fifo_strategy: " + cfg.auto_fifo_strategy
     else:
@@ -1234,7 +1296,10 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
     "step_generate_estimate_reports": step_generate_estimate_reports,
     "step_hw_codegen": step_hw_codegen,
     "step_hw_ipgen": step_hw_ipgen,
-    "step_insert_dwc": step_insert_dwc,
+    "step_build_simulation": step_build_simulation,
+    "step_size_fifo_isolated": step_size_fifo_isolated,
+    "step_size_fifo_connected": step_size_fifo_connected,
+    "step_apply_fifosizes": step_apply_fifosizes,
     "step_set_fifo_depths": step_set_fifo_depths,
     "step_create_stitched_ip": step_create_stitched_ip,
     "step_measure_rtlsim_performance": step_measure_rtlsim_performance,
diff --git a/src/finn/builder/custom_step_library/resnet.py b/src/finn/builder/custom_step_library/resnet.py
index 02d7f5ff58..1781bd8636 100644
--- a/src/finn/builder/custom_step_library/resnet.py
+++ b/src/finn/builder/custom_step_library/resnet.py
@@ -34,35 +34,69 @@
 hardware conversion.
 """
 
+from qonnx.core.datatype import DataType
 from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine
 from qonnx.transformation.composed import ComposedTransformation
 from qonnx.transformation.double_to_single_float import DoubleToSingleFloat
 from qonnx.transformation.fold_constants import FoldConstants
 from qonnx.transformation.general import (
+    ConvertDivToMul,
+    ConvertSubToAdd,
     GiveReadableTensorNames,
     GiveUniqueNodeNames,
     GiveUniqueParameterTensors,
+    RemoveStaticGraphInputs,
     RemoveUnusedTensors,
     SortGraph,
 )
 from qonnx.transformation.infer_data_layouts import InferDataLayouts
 from qonnx.transformation.infer_datatypes import InferDataTypes
 from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.transformation.insert_topk import InsertTopK
 from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+from qonnx.transformation.remove import RemoveIdentityOps
 
 import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
 from finn.builder.build_dataflow_config import DataflowBuildConfig
 from finn.transformation.fpgadataflow.replicate_stream import InferReplicateStream
 from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
 from finn.transformation.streamline.absorb import (
+    Absorb1BitMulIntoConv,
+    Absorb1BitMulIntoMatMul,
     AbsorbAddIntoMultiThreshold,
+    AbsorbConsecutiveTransposes,
+    AbsorbMulIntoMultiThreshold,
+    AbsorbScalarMulAddIntoTopK,
     AbsorbSignBiasIntoMultiThreshold,
     AbsorbTransposeIntoMultiThreshold,
+    FactorOutMulSignMagnitude,
+)
+from finn.transformation.streamline.collapse_repeated import (
+    CollapseRepeatedAdd,
+    CollapseRepeatedMul,
 )
 from finn.transformation.streamline.remove import RemoveIdentityReshape, RemoveIdentityTranspose
 
 # just for not linear
-from finn.transformation.streamline.reorder import MoveMulPastAdd
+# just for not linear
+from finn.transformation.streamline.reorder import (
+    MoveAddPastConv,
+    MoveAddPastMul,
+    MoveLinearPastEltwiseAdd,
+    MoveLinearPastFork,
+    MoveMaxPoolPastMultiThreshold,
+    MoveMulPastAdd,
+    MoveScalarAddPastMatMul,
+    MoveScalarLinearPastInvariants,
+    MoveScalarMulPastConv,
+    MoveScalarMulPastMatMul,
+    MoveTransposePastEltwise,
+    MoveTransposePastFork,
+    MoveTransposePastJoinAdd,
+)
+from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
+from finn.transformation.streamline.sign_to_thres import ConvertSignToThres
 from finn.transformation.streamline.streamline_plus import StreamlinePlus as Streamline
 
 
@@ -144,3 +178,155 @@ def step_resnet_convert_to_hw(
     model = model.transform(RemoveUnusedTensors())
     model = model.transform(SortGraph())
     return model
+
+
+# For backwards compatibility
+
+
+def step_resnet50_tidy(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Tidy up ResNet-50 models (backwards-compatible legacy step).
+
+    Applies shape and datatype inference, constant folding, unique naming, and
+    inserts a TopK layer at the output.
+    """
+    model = model.transform(GiveUniqueParameterTensors())
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(RemoveStaticGraphInputs())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+    model = model.transform(InsertTopK())
+    model = model.transform(InferShapes())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+    return model
+
+
+def step_resnet50_streamline_linear(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Apply linear streamlining transformations to a ResNet-50 model.
+
+    Moves and absorbs scalar linear operations (mul, add) past convolutions and
+    matrix multiplications, collapses repeated operations, converts sign nodes
+    to thresholds, and absorbs values into multithreshold nodes.
+    """
+    streamline_transformations = [
+        AbsorbScalarMulAddIntoTopK(),  # before MoveAddPastMul to avoid int->float
+        ConvertSubToAdd(),
+        ConvertDivToMul(),
+        RemoveIdentityOps(),
+        CollapseRepeatedMul(),
+        BatchNormToAffine(),
+        ConvertSignToThres(),
+        MoveAddPastMul(),
+        MoveScalarAddPastMatMul(),
+        MoveAddPastConv(),
+        MoveScalarMulPastMatMul(),
+        MoveScalarMulPastConv(),
+        MoveScalarLinearPastInvariants(),
+        MoveAddPastMul(),
+        CollapseRepeatedAdd(),
+        CollapseRepeatedMul(),
+        AbsorbAddIntoMultiThreshold(),
+        FactorOutMulSignMagnitude(),
+        MoveMaxPoolPastMultiThreshold(),
+        AbsorbMulIntoMultiThreshold(),
+        Absorb1BitMulIntoMatMul(),
+        Absorb1BitMulIntoConv(),
+        RoundAndClipThresholds(),
+    ]
+    for trn in streamline_transformations:
+        model = model.transform(trn)
+        model = model.transform(GiveUniqueNodeNames())
+    return model
+
+
+def step_resnet50_streamline_nonlinear(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Apply non-linear streamlining transformations to a ResNet-50 model.
+
+    Moves linear operations past elementwise-add nodes and fork points to
+    enable further fusion in subsequent linear streamlining passes.
+    """
+    streamline_transformations = [
+        MoveLinearPastEltwiseAdd(),
+        MoveLinearPastFork(),
+    ]
+    for trn in streamline_transformations:
+        model = model.transform(trn)
+        model = model.transform(GiveUniqueNodeNames())
+    return model
+
+
+def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Streamline a ResNet-50 model (backwards-compatible legacy step).
+
+    Iterates linear and non-linear streamlining passes, then lowers convolutions
+    to matrix multiplications and absorbs the resulting transpose operations.
+    """
+    for iter_id in range(4):
+        model = step_resnet50_streamline_linear(model, cfg)
+        model = step_resnet50_streamline_nonlinear(model, cfg)
+
+        # big loop tidy up
+        model = model.transform(RemoveUnusedTensors())
+        model = model.transform(GiveReadableTensorNames())
+        model = model.transform(InferDataTypes())
+        model = model.transform(SortGraph())
+
+    model = model.transform(DoubleToSingleFloat())
+
+    # Lower convolutions and streamline resulting transposes
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(
+        ComposedTransformation(
+            [
+                MoveTransposePastJoinAdd(),
+                MoveTransposePastFork(),
+                MoveTransposePastEltwise(),
+                AbsorbConsecutiveTransposes(),
+                AbsorbTransposeIntoMultiThreshold(),
+            ]
+        )
+    )
+    return model
+
+
+def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig):
+    """Convert a ResNet-50 model to hardware-specific operations (backwards-compatible legacy step).
+
+    Sets the input datatype to UINT8, then sequentially converts channelwise
+    linear layers, pooling, matrix-vector activations, thresholding, convolution
+    input generators, stream duplication/addition, and label selection to their
+    corresponding HLS hardware layer variants.
+    """
+    model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"])
+    model = model.transform(InferDataLayouts())
+    model = model.transform(DoubleToSingleFloat())
+    model = model.transform(InferDataTypes())
+    model = model.transform(SortGraph())
+
+    to_hw_transformations = [
+        to_hw.InferChannelwiseLinearLayer,
+        to_hw.InferPool,
+        AbsorbConsecutiveTransposes,
+        RoundAndClipThresholds,
+        to_hw.InferQuantizedMatrixVectorActivation,
+        to_hw.InferThresholdingLayer,
+        to_hw.InferConvInpGen,
+        to_hw.InferDuplicateStreamsLayer,
+        to_hw.InferAddStreamsLayer,
+        to_hw.InferLabelSelectLayer,
+    ]
+    for trn in to_hw_transformations:
+        model = model.transform(trn())
+        model = model.transform(InferDataLayouts())
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(InferDataTypes())
+
+    model = model.transform(RemoveCNVtoFCFlatten())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(RemoveUnusedTensors())
+    model = model.transform(SortGraph())
+
+    return model
diff --git a/src/finn/builder/custom_step_library/transformer_adhoc.py b/src/finn/builder/custom_step_library/transformer_adhoc.py
index cbcd28a916..70aae9d59e 100644
--- a/src/finn/builder/custom_step_library/transformer_adhoc.py
+++ b/src/finn/builder/custom_step_library/transformer_adhoc.py
@@ -150,7 +150,7 @@ def _set_folding_attention(model: ModelWrapper, target_cycles_per_frame):
             # parallelism in steps following the common divisors the inputs.
             for fold in reversed(common_divisors([qkdim, vdim])):
                 # Configure the folding attribute
-                inst.set_nodeattr("EmbFold", fold)
+                inst.set_nodeattr("EmbFold", int(fold))
                 # Check if this is sufficient to meet the cycles target
                 if inst.get_exp_cycles() <= target_cycles_per_frame:
                     break
@@ -159,7 +159,7 @@ def _set_folding_attention(model: ModelWrapper, target_cycles_per_frame):
             # parallelism in steps divisors of the key and value sequence.
             for fold in reversed(common_divisors([kvlen])):
                 # Configure the folding attribute
-                inst.set_nodeattr("SeqFold", fold)
+                inst.set_nodeattr("SeqFold", int(fold))
                 # Check if this is sufficient to meet the cycles target
                 if inst.get_exp_cycles() <= target_cycles_per_frame:
                     break
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 9671ed3d71..f8a08260ca 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -29,6 +29,9 @@
 
 import numpy as np
 import os
+import shlex
+import subprocess
+import sys
 from pathlib import Path
 from qonnx.custom_op.registry import getCustomOp
 from subprocess import CalledProcessError
@@ -41,7 +44,8 @@
     make_build_dir,
 )
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-from finn.util.exception import FINNConfigurationError, FINNError, FINNInternalError
+from finn.util.logging import log
+from finn.util.exception import FINNConfigurationError, FINNError, FINNInternalError, FINNUserError
 
 finnxsi = xsi if xsi.is_available() else None
 
@@ -126,6 +130,10 @@ def file_to_basename(x: str | Path) -> str:
 def rtlsim_exec_cppxsi(
     model,
     execution_context,
+    is_single_node: bool,
+    total_nodes: int = 1,
+    current_node_index: int | None = None,
+    previous_node_name: str | None = None,
     dummy_data_mode=False,
     timeout_cycles=None,
     throttle_cycles=0,
@@ -181,7 +189,8 @@ def rtlsim_exec_cppxsi(
         vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
         with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f:
             all_verilog_srcs = f.read().split()
-        single_src_dir = make_build_dir("rtlsim_" + top_module_name + "_")
+        rtlsim_name = model.graph.node[0].name if is_single_node else top_module_name
+        single_src_dir = make_build_dir("rtlsim_" + rtlsim_name + "_")
         debug = not (trace_file is None or trace_file == "")
         rtlsim_so = finnxsi.compile_sim_obj(
             top_module_name, all_verilog_srcs, single_src_dir, debug=debug, behav=True
@@ -212,6 +221,7 @@ def rtlsim_exec_cppxsi(
             raise FINNInternalError("The finn_xsi directory could not be found. Stopping here.")
 
     # prepare the C++ sim driver template
+    finnxsi_dir = os.environ["FINN_XSI"]
     with fifosim_config_fname.open() as f:
         fifsom_config_template = f.read()
 
@@ -273,62 +283,72 @@ def rtlsim_exec_cppxsi(
         "TOP_MODULE_NAME": top_module_name,
         # top-level AXI stream descriptors
         "ISTREAM_DESC": instream_descrs_str,
+        "ISTREAM_LEN": len(instream_names),
         "OSTREAM_DESC": outstream_descrs_str,
+        "OSTREAM_LEN": len(outstream_names),
         # control tracing and trace filename
         "TRACE_FILE": "nullptr" if trace_file is None else f'"{trace_file}"',
         # sim kernel .so to use (depends on Vivado version)
         "SIMKERNEL_SO": finnxsi.get_simkernel_so(),
         # log file for xsi (not the sim driver)
         "XSIM_LOG_FILE": '"xsi.log"',
+        # Node name in case of single-node simulation
+        "NODE_NAME": model.graph.node[0].name,
+        # Previous node name (for single node simulation)
+        "PREVIOUS_NODE_NAME": "std::nullopt"
+        if previous_node_name is None
+        else f'"{previous_node_name}"',
+        "NODE_INDEX": current_node_index if is_single_node else 0,
+        "TOTAL_NODES": total_nodes,
     }
+
+    fifosim_config_fname = Path(finnxsi_dir) / "rtlsim_config.hpp.template"
+    fsim_config = fifosim_config_fname.read_text()
     for key, val in template_dict.items():
-        fifsom_config_template = fifsom_config_template.replace(f"@{key}@", str(val))
-    with open(sim_base + "/rtlsim_config.hpp", "w") as f:
-        f.write(fifsom_config_template)
-
-    vivado_incl_dir = get_vivado_root() + "/data/xsim/include"
-    # launch g++ to compile the rtlsim executable
-    build_cmd = [
-        "g++",
-        f"-I{finnxsi_dir}",
-        f"-I{vivado_incl_dir}",
-        f"-I{sim_base}",
-        "-std=c++17",
-        "-O3",
-        "-o",
-        "rtlsim_xsi",
-        f"{finnxsi_dir}/rtlsim_xsi.cpp",
-        f"{finnxsi_dir}/xsi_finn.cpp",
-        "-ldl",
-        "-lrt",
-    ]
-    # write compilation command to a file for easy re-running/debugging
-    with open(sim_base + "/compile_rtlsim.sh", "w") as f:
-        f.write(" ".join(build_cmd))
+        fsim_config = fsim_config.replace(f"@{key}@", str(val))
+
+    # Write the config to the simulation directory
+    rtlsim_config = Path(sim_base) / "rtlsim_config.hpp"
+    rtlsim_config.write_text(fsim_config)
+
+    # Building the whole simulation
+    # Running CMake first
+    cmake_call = f"{sys.executable} -m cmake -S {finnxsi_dir} -B {sim_base}"
+    log.info(f"Running cmake on RTLSIM Wrapper in {sim_base}")
     try:
-        launch_process_helper(build_cmd, cwd=sim_base, print_stdout=False)
-    except CalledProcessError:
-        raise FINNError("Failed to compile rtlsim executable")
-    if not os.path.isfile(sim_base + "/rtlsim_xsi"):
-        raise FINNError("Failed to compile rtlsim executable")
-
-    # launch the rtlsim executable
-    runsim_cmd = ["bash", "run_rtlsim.sh"]
-    with open(sim_base + "/run_rtlsim.sh", "w") as f:
-        f.write("./rtlsim_xsi > rtlsim_xsi_log.txt")
-    launch_process_helper(runsim_cmd, cwd=sim_base)
+        launch_process_helper(
+            shlex.split(cmake_call), cwd=finnxsi_dir, print_stdout=True, proc_env=os.environ.copy()
+        )
+    except CalledProcessError as e:
+        raise FINNError(f"Failed to run cmake in {sim_base}") from e
+
+    # Calling make to actually build the simulation
+    makefile = Path(sim_base) / "Makefile"
+    if not makefile.exists():
+        raise FINNUserError(f"Failed to create Makefile in {sim_base}!")
+    try:
+        launch_process_helper(["make"], proc_env=os.environ.copy(), cwd=sim_base)
+    except CalledProcessError as e:
+        raise FINNUserError(f"Failed to create executable in {sim_base}!") from e
+
+    # TODO: Fix name for general rtlsim
+    simulation_executable = Path(sim_base) / "LayerSimulationBackend"
+    assert simulation_executable.exists()
+
+    # Prepare the script to run the simulation
+    # (important to specify LD_LIBRARY_PATH here for XSI to work correctly)
+    runsim = Path(sim_base) / "run_fifosim.sh"
+    ld_library_path = get_vivado_root() + "/lib/lnx64.o"
+    runsim.write_text(f"LD_LIBRARY_PATH={ld_library_path}:$LD_LIBRARY_PATH {simulation_executable}")
+
+    # Actually run the simulation
+    subprocess.run(
+        ["bash", runsim.name], cwd=sim_base, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
 
     # parse results file and return dict
-    results_filename = sim_base + "/results.txt"
-    with open(results_filename, "r") as f:
-        results = f.read().strip().split("\n")
-    ret_dict = {}
-    for result_line in results:
-        key, val = result_line.split("\t")
-        ret_dict[key] = int(val)
-    if "TIMEOUT" in ret_dict.keys():
-        assert ret_dict["TIMEOUT"] == 0, f"XSI C++ simulation timed out, see {results_filename}"
-    return ret_dict
+    # TODO
+    return {}
 
 
 def rtlsim_exec_finnxsi(model, execution_context, pre_hook=None, post_hook=None):
diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py
index 3c2203b057..d33629b43e 100644
--- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py
+++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py
@@ -35,13 +35,14 @@
 
 from finn.custom_op.fpgadataflow import register_custom_op
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.custom_op.fpgadataflow.memstream import MemStreamSupport
 
 # FINN logging
 from finn.util.logging import log
 
 
 # Generic implementation for elementwise binary operations
-class ElementwiseBinaryOperation(HWCustomOp):
+class ElementwiseBinaryOperation(MemStreamSupport, HWCustomOp):
     # Specifies the elementwise operation to be implemented
     #   Format: (Identifier, Python, C++, RTL)
     _operation: tuple[str, np.ufunc, str, str] | None = None
diff --git a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
index 14ef567404..72bc3bd973 100644
--- a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
@@ -35,7 +35,7 @@
 from finn.util.logging import log
 
 
-class CheckSum_hls(HWCustomOp, HLSBackend):
+class CheckSum_hls(HLSBackend, HWCustomOp):
     """Class that corresponds to custom_hls checksum function."""
 
     def __init__(self, onnx_node, **kwargs):
diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
index bf239dd056..6fb54ddcff 100644
--- a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
@@ -74,7 +74,7 @@
 #       -the folded shape is not defined
 
 
-class IODMA_hls(HWCustomOp, HLSBackend):
+class IODMA_hls(HLSBackend, HWCustomOp):
     """Class that corresponds to finn-hlslib DMA function(s)."""
 
     def __init__(self, onnx_node, **kwargs):
diff --git a/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py
index 610dd2f6ef..34f93705af 100644
--- a/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py
@@ -31,7 +31,7 @@
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 
 
-class TLastMarker_hls(HWCustomOp, HLSBackend):
+class TLastMarker_hls(HLSBackend, HWCustomOp):
     """Node that adds/removes AXI stream TLAST signals where needed. Its behavior
     is transparent in node-by-node execution, only visible in IP-stitched rtlsim or
     actual hardware.
diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
index 5da49c4d98..653aa5a3da 100644
--- a/src/finn/custom_op/fpgadataflow/hlsbackend.py
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -36,6 +36,7 @@
 
 from finn import xsi
 from finn.custom_op.fpgadataflow import templates
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 from finn.templates import get_templates_folder
 from finn.util.basic import CppBuilder, launch_process_helper, make_build_dir
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
@@ -47,7 +48,7 @@
 finnxsi = xsi if xsi.is_available() else None
 
 
-class HLSBackend(ABC):
+class HLSBackend(HWCustomOp, ABC):
     """HLSBackend class all custom ops that correspond to a finn-hlslib
     function are using functionality of. Contains different functions every HLS
     custom node should have. Some as abstract methods, these have to be filled
@@ -55,15 +56,19 @@ class HLSBackend(ABC):
 
     def get_nodeattr_types(self):
         """Return dictionary of node attribute types and properties."""
-        return {
-            "code_gen_dir_cppsim": ("s", False, ""),
-            "executable_path": ("s", False, ""),
-            "res_hls": ("s", False, ""),
-            # temporary node attribute to keep track of interface style of hls ops
-            "cpp_interface": ("s", False, "packed", {"packed", "hls_vector"}),
-            # temporary node attribute to keep track of execution style of hls ops
-            "hls_style": ("s", False, "ifm_aware", {"ifm_aware", "freerunning"}),
-        }
+        super_types = super().get_nodeattr_types()
+        super_types.update(
+            {
+                "code_gen_dir_cppsim": ("s", False, ""),
+                "executable_path": ("s", False, ""),
+                "res_hls": ("s", False, ""),
+                # temporary node attribute to keep track of interface style of hls ops
+                "cpp_interface": ("s", False, "packed", {"packed", "hls_vector"}),
+                # temporary node attribute to keep track of execution style of hls ops
+                "hls_style": ("s", False, "ifm_aware", {"ifm_aware", "freerunning"}),
+            }
+        )
+        return super_types
 
     def get_all_verilog_paths(self):
         """Return list of all folders containing Verilog code for this node."""
diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py
index ce967d1d47..d6132f68ab 100644
--- a/src/finn/custom_op/fpgadataflow/hwcustomop.py
+++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py
@@ -45,8 +45,9 @@
 from qonnx.util.basic import roundup_to_integer_multiple
 from typing import TYPE_CHECKING, Any, cast
 
-from finn import xsi
 from finn.util.basic import get_liveness_threshold_cycles, is_versal
+from finn.util.deprecated import deprecated
+from finn import xsi
 from finn.util.exception import FINNInternalError
 from finn.util.logging import log
 from finn.util.settings import get_settings
@@ -56,6 +57,9 @@
 
 finnxsi = xsi if xsi.is_available() else None
 
+if TYPE_CHECKING:
+    from qonnx.core.modelwrapper import ModelWrapper
+
 
 class HWCustomOp(CustomOp):
     """HWCustomOp class all custom ops that can be implemented with either
@@ -334,10 +338,9 @@ def rtlsim_multi_io(self, sim: SimEngine, io_dict: dict[str, Any], sname: str =
     def verify_node(self) -> None:
         """Can be implemented to verify that all attributes the node needs
         are there and that particular attributes are set correctly. Can also
-        check if the number of inputs is equal to the expected number.
-        """
+        check if the number of inputs is equal to the expected number."""
 
-    def generate_params(self, model: "ModelWrapper", path: str) -> None:
+    def generate_params(self, model: Any, path: str) -> None:
         """Generate parameters (i.e. weights and thresholds).
 
         Member function of HWCustomOp class that must be implemented by every node
@@ -514,6 +517,7 @@ def generate_hdl_dynload(self) -> None:
         with output_path.open("w") as f:
             f.write(template_wrapper)
 
+    @deprecated
     def derive_characteristic_fxns(
         self, period: int, override_rtlsim_dict: dict | None = None
     ) -> None:
diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
index 4f2962a6d1..c87412a2d2 100644
--- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py
@@ -48,6 +48,7 @@
 )
 
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.custom_op.fpgadataflow.memstream import MemStreamSupport
 from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string
 from finn.util.logging import log
 from finn.util.settings import get_settings
@@ -60,7 +61,7 @@
 # the ... here can be any shape (representing groups of vectors)
 
 
-class MVAU(HWCustomOp):
+class MVAU(MemStreamSupport, HWCustomOp):
     """Abstraction layer for HW implementation of MatrixVectorActivation layers."""
 
     def __init__(self, onnx_node, **kwargs):
diff --git a/src/finn/custom_op/fpgadataflow/memstream.py b/src/finn/custom_op/fpgadataflow/memstream.py
new file mode 100644
index 0000000000..ee6305f26f
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/memstream.py
@@ -0,0 +1,71 @@
+"""Support for memory stream operations in FPGA dataflow."""
+
+import os
+from pathlib import Path
+from typing import cast
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.util.basic import is_versal
+
+
+class MemStreamSupport(HWCustomOp):
+    """Custom Op for memory stream operations in FPGA dataflow."""
+
+    def calc_tmem(self) -> int:
+        """Abstract method to calculate threshold memory size.
+        The default implementation raises NotImplementedError because
+        some subclasses dont implement calc_tmem."""
+        raise NotImplementedError()
+
+    def calc_wmem(self) -> int:
+        """Abstract method to calculate weight memory size.
+        The default implementation raises NotImplementedError because
+        some subclasses dont implement calc_wmem."""
+        raise NotImplementedError()
+
+    def generate_hdl_memstream(self, fpgapart: str, pumped_memory: int = 0) -> None:
+        """Generate verilog code for memstream component.
+
+        Currently utilized by MVAU, VVAU and HLS Thresholding layer.
+
+        Args:
+            fpgapart: Target FPGA part string.
+            pumped_memory: Whether to use pumped memory (default: 0).
+
+        """
+        ops = ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl", "Thresholding_hls"]
+        if self.onnx_node.op_type in ops or self.onnx_node.op_type.startswith("Elementwise"):
+            template_path = (
+                Path(os.environ["FINN_RTLLIB"]) / "memstream/hdl/memstream_wrapper_template.v"
+            )
+            mname = self.onnx_node.name
+            if self.onnx_node.op_type.startswith("Thresholding"):
+                depth = self.calc_tmem()
+            else:
+                depth = self.calc_wmem()
+            padded_width = self.get_instream_width_padded(1)
+            code_gen_dir = cast("str", self.get_nodeattr("code_gen_dir_ipgen"))
+
+            ram_style = cast("str", self.get_nodeattr("ram_style"))
+            init_file = str(Path(code_gen_dir) / "memblock.dat")
+            if ram_style == "ultra" and not is_versal(fpgapart):
+                init_file = ""
+            code_gen_dict = {
+                "$MODULE_NAME$": [mname],
+                "$SETS$": ["1"],
+                "$DEPTH$": [str(depth)],
+                "$WIDTH$": [str(padded_width)],
+                "$INIT_FILE$": [init_file],
+                "$RAM_STYLE$": [ram_style],
+                "$PUMPED_MEMORY$": [str(pumped_memory)],
+            }
+            # apply code generation to template
+            with template_path.open() as f:
+                template_wrapper = f.read()
+            for key in code_gen_dict:
+                # transform list into long string separated by '\n'
+                code_gen_line = "\n".join(code_gen_dict[key])
+                template_wrapper = template_wrapper.replace(key, code_gen_line)
+            output_path = Path(code_gen_dir) / f"{mname}_memstream_wrapper.v"
+            with output_path.open("w") as f:
+                f.write(template_wrapper)
diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
index 0f56733541..72befc53ea 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
@@ -58,6 +58,7 @@ def register_custom_op(cls):
 from finn.custom_op.fpgadataflow.rtl.inner_shuffle_rtl import InnerShuffle_rtl
 from finn.custom_op.fpgadataflow.rtl.layernorm_rtl import LayerNorm_rtl
 from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MVAU_rtl
+from finn.custom_op.fpgadataflow.rtl.removedatapath_rtl import RemoveDataPath_rtl
 from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import (
     StreamingDataWidthConverter_rtl,
 )
@@ -75,4 +76,5 @@ def register_custom_op(cls):
 custom_op["MVAU_rtl"] = MVAU_rtl
 custom_op["VVAU_rtl"] = VVAU_rtl
 custom_op["Thresholding_rtl"] = Thresholding_rtl
+custom_op["RemoveDataPath_rtl"] = RemoveDataPath_rtl
 custom_op["InnerShuffle_rtl"] = InnerShuffle_rtl
diff --git a/src/finn/custom_op/fpgadataflow/rtl/removedatapath_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/removedatapath_rtl.py
new file mode 100644
index 0000000000..aa7a7c0004
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/rtl/removedatapath_rtl.py
@@ -0,0 +1,371 @@
+"""RTL implementation for RemoveDataPath custom operation.
+
+This module provides the RTL backend implementation for the RemoveDataPath
+custom operation, which removes data from the datapath while maintaining
+the control flow.
+"""
+
+import numpy as np
+import os
+from collections.abc import Sequence
+from numpy import ndarray
+from numpy import typing as npt
+from onnx import NodeProto
+from pathlib import Path
+from qonnx.core.datatype import BaseDataType, DataType
+from typing import Any, cast
+
+from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
+from finn.util.exception import FINNInternalError
+from finn.util.logging import log
+
+
+class RemoveDataPath_rtl(RTLBackend):
+    """RTL implementation for RemoveDataPath custom op."""
+
+    def __init__(self, onnx_node: NodeProto, **kwargs: Any) -> None:
+        """Initialize RemoveDataPath RTL backend.
+
+        Args:
+            onnx_node: The ONNX node proto for this operation.
+            **kwargs: Additional keyword arguments passed to parent class.
+
+        """
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self) -> dict:
+        """Return node attribute types for this custom operation.
+
+        Returns:
+            Dictionary mapping attribute names to their type specifications.
+
+        """
+        my_attrs = super().get_nodeattr_types()
+        my_attrs.update(
+            {
+                # folded shape of input/output
+                "folded_shape": ("ints", True, []),
+                # normal shape of input/output
+                "normal_shape": ("ints", True, []),
+                # FINN DataTypes for inputs/outputs
+                "dataType": ("s", True, ""),
+            }
+        )
+        return my_attrs
+
+    def infer_node_datatype(self, model: Any) -> None:
+        """Infer and set the output datatype based on input datatype.
+
+        Args:
+            model: The model wrapper containing this node.
+
+        """
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            log.warning(
+                f"inputDataType changing for {node.name}: {self.get_input_datatype()} -> {idt}"
+            )
+        self.set_nodeattr("dataType", idt.name)
+        # data type stays the same
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def get_rtl_file_list(self, abspath: bool = False) -> list[Path]:
+        """Return list of RTL files required for this custom operation.
+
+        Args:
+            abspath: Whether to return absolute paths (default: False).
+
+        Returns:
+            List of Path objects pointing to required RTL files.
+
+        Raises:
+            FINNInternalError: If code_gen_dir_ipgen or gen_top_module attributes are invalid.
+
+        """
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if abspath else ""
+
+        top_name = self.get_nodeattr("gen_top_module")
+        if type(code_gen_dir) is not str:
+            raise FINNInternalError(
+                f"code_gen_dir_ipgen attribute not set in {self.onnx_node.name}, "
+                "cannot get RTL file list"
+            )
+        if type(top_name) is not str or top_name == "":
+            raise FINNInternalError(
+                f"gen_top_module attribute not set in {self.onnx_node.name}, "
+                "cannot get RTL file list"
+            )
+
+        code_gen_dir_path = Path(code_gen_dir)
+
+        verilog_files = [
+            code_gen_dir_path / f"{top_name}.v",
+        ]
+        return verilog_files
+
+    def generate_hdl(self, model: Any, fpgapart: str, clk: str) -> None:  # noqa: ARG002
+        """Generate the RTL code for this custom op.
+
+        Args:
+            model: The model wrapper containing this node (unused).
+            fpgapart: Target FPGA part string (unused).
+            clk: Clock period in nanoseconds (unused).
+
+        Raises:
+            FINNInternalError: If code_gen_dir_ipgen attribute is invalid.
+
+        """
+        rtlsrc = Path(os.environ["FINN_RTLLIB"]) / "removedatapath" / "hdl"
+        template_path = rtlsrc / "dummy_template.v"
+
+        # save top module name so we can refer to it after this node has been renamed
+        # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject)
+        topname = self.get_verilog_top_module_name()
+        self.set_nodeattr("gen_top_module", topname)
+
+        # make instream width a multiple of 8 for axi interface
+        in_width = self.get_instream_width_padded()
+
+        code_gen_dict = {"$TOP_MODULE_NAME$": topname, "$WIDTH$": str(in_width)}
+
+        # apply code generation to templates
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        if type(code_gen_dir) is not str or code_gen_dir == "":
+            raise FINNInternalError(
+                f"code_gen_dir_ipgen attribute not set in {topname}, cannot generate RTL code"
+            )
+        with Path.open(template_path) as f:
+            template = f.read()
+
+        for placeholder, value in code_gen_dict.items():
+            template = template.replace(placeholder, value)
+
+        output_path = Path(code_gen_dir) / f"{self.get_verilog_top_module_name()}.v"
+        with Path.open(output_path, "w") as f:
+            f.write(template)
+
+        # set ipgen_path and ip_path so that HLS-Synth transformation
+        # and stich_ip transformation do not complain
+        # i.e. during the HLSSynthIP() transformation
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def code_generation_ipi(self) -> list[str]:
+        """Code generation for IP integration."""
+        sourcefiles = self.get_rtl_file_list(abspath=True)
+
+        cmd = []
+        for f in sourcefiles:
+            cmd += [f"add_files -norecurse {f}"]
+        cmd += [
+            "create_bd_cell -type module -reference "
+            f"{self.get_nodeattr('gen_top_module')} {self.onnx_node.name}"
+        ]
+        return cmd
+
+    def get_normal_input_shape(
+        self, ind: int = 0  # noqa: ARG002
+    ) -> Sequence[int] | npt.NDArray[np.int_]:
+        """Return the normal (unfolded) input shape.
+
+        Args:
+            ind: Input index (unused, kept for interface compatibility).
+
+        Returns:
+            The normal input shape dimensions.
+
+        Raises:
+            FINNInternalError: If normal_shape attribute is invalid or empty.
+
+        """
+        normal_shape = self.get_nodeattr("normal_shape")
+        if (
+            type(normal_shape) is not list
+            and type(normal_shape) is not tuple
+            and not isinstance(normal_shape, ndarray)
+        ):
+            raise FINNInternalError(
+                f"normal_shape attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get normal input shape"
+            )
+        if len(normal_shape) == 0:
+            raise FINNInternalError(
+                f"normal_shape attribute is empty in {self.onnx_node.name}, "
+                "cannot get normal input shape"
+            )
+        if type(normal_shape[0]) is not int:
+            raise FINNInternalError(
+                f"normal_shape attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get normal input shape"
+            )
+        return normal_shape
+
+    def get_normal_output_shape(
+        self, ind: int = 0  # noqa: ARG002
+    ) -> Sequence[int] | npt.NDArray[np.int_]:
+        """Return the normal (unfolded) output shape.
+
+        Args:
+            ind: Output index (unused, kept for interface compatibility).
+
+        Returns:
+            Tuple containing the normal output shape dimensions.
+
+        """
+        return self.get_normal_input_shape()
+
+    def get_folded_input_shape(
+        self, ind: int = 0  # noqa: ARG002
+    ) -> Sequence[int] | npt.NDArray[np.int_]:
+        """Return the folded input shape.
+
+        Args:
+            ind: Input index (unused, kept for interface compatibility).
+
+        Returns:
+            Tuple containing the folded input shape dimensions.
+
+        """
+        folded_shape = self.get_nodeattr("folded_shape")
+        if (
+            type(folded_shape) is not list
+            and type(folded_shape) is not tuple
+            and not isinstance(folded_shape, ndarray)
+        ):
+            raise FINNInternalError(
+                f"folded_shape attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get folded input shape"
+            )
+        if len(folded_shape) == 0:
+            raise FINNInternalError(
+                f"folded_shape attribute is empty in {self.onnx_node.name}, "
+                "cannot get folded input shape"
+            )
+        if type(folded_shape[0]) is not int:
+            raise FINNInternalError(
+                f"folded_shape attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get folded input shape"
+            )
+        return cast("Sequence[int]", folded_shape)
+
+    def get_folded_output_shape(
+        self, ind: int = 0  # noqa: ARG002
+    ) -> Sequence[int] | npt.NDArray[np.int_]:
+        """Return the folded output shape.
+
+        Args:
+            ind: Output index (unused, kept for interface compatibility).
+
+        Returns:
+            Tuple containing the folded output shape dimensions.
+
+        """
+        return self.get_folded_input_shape()
+
+    def get_instream_width(self, ind: int = 0) -> int:  # noqa: ARG002
+        """Return the input stream width in bits.
+
+        Args:
+            ind: Input index (unused, kept for interface compatibility).
+
+        Returns:
+            Input stream width in bits.
+
+        """
+        dtype = self.get_nodeattr("dataType")
+        if type(dtype) is not str:
+            raise FINNInternalError(
+                f"dataType attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get instream width"
+            )
+        dtype = DataType[dtype]
+        folded_shape = self.get_nodeattr("folded_shape")
+        if (
+            type(folded_shape) is not list
+            and type(folded_shape) is not tuple
+            and not isinstance(folded_shape, ndarray)
+        ):
+            raise FINNInternalError(
+                f"folded_shape attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get outstream width"
+            )
+        in_width = folded_shape[-1] * dtype.bitwidth()
+        return in_width
+
+    def get_outstream_width(self, ind: int = 0) -> int:  # noqa: ARG002
+        """Return the output stream width in bits.
+
+        Args:
+            ind: Output index (unused, kept for interface compatibility).
+
+        Returns:
+            Output stream width in bits.
+
+        Raises:
+            FINNInternalError: If dataType or folded_shape attributes are invalid.
+
+        """
+        dtype = self.get_nodeattr("dataType")
+        if type(dtype) is not str:
+            raise FINNInternalError(
+                f"dataType attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get outstream width"
+            )
+        dtype = DataType[dtype]
+        folded_shape = self.get_nodeattr("folded_shape")
+        if (
+            type(folded_shape) is not list
+            and type(folded_shape) is not tuple
+            and not isinstance(folded_shape, ndarray)
+        ):
+            raise FINNInternalError(
+                f"folded_shape attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get outstream width"
+            )
+        in_width = folded_shape[-1] * dtype.bitwidth()
+        return in_width
+
+    def get_input_datatype(self, ind: int = 0) -> BaseDataType:  # noqa: ARG002
+        """Return the input data type.
+
+        Args:
+            ind: Input index (unused, kept for interface compatibility).
+
+        Returns:
+            The QONNX data type for the input.
+
+        Raises:
+            FINNInternalError: If dataType attribute is invalid.
+
+        """
+        dtype = self.get_nodeattr("dataType")
+        if type(dtype) is not str:
+            raise FINNInternalError(
+                f"dataType attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get outstream width"
+            )
+        dtype = DataType[dtype]
+        return dtype
+
+    def get_output_datatype(self, ind: int = 0) -> BaseDataType:  # noqa: ARG002
+        """Return the output data type.
+
+        Args:
+            ind: Output index (unused, kept for interface compatibility).
+
+        Returns:
+            The QONNX data type for the output.
+
+        Raises:
+            FINNInternalError: If dataType attribute is invalid.
+
+        """
+        dtype = self.get_nodeattr("dataType")
+        if type(dtype) is not str:
+            raise FINNInternalError(
+                f"dataType attribute not set correctly in {self.onnx_node.name}, "
+                "cannot get outstream width"
+            )
+        dtype = DataType[dtype]
+        return dtype
diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py
index 96d3d9f116..d8a5d210b9 100644
--- a/src/finn/custom_op/fpgadataflow/rtlbackend.py
+++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py
@@ -26,76 +26,174 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+"""RTL backend support for FINN custom operations.
+
+This module provides the RTLBackend abstract base class that all RTL-based custom
+operations in FINN inherit from. It includes functionality for HDL code generation,
+RTL simulation, and integration with Vivado IP Integrator.
+"""
+
 import numpy as np
-import os
+import numpy.typing as npt
 from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import TYPE_CHECKING, cast
+
+if TYPE_CHECKING:
+    from onnx import GraphProto
+    from qonnx.core.modelwrapper import ModelWrapper
 
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 from finn import xsi
 from finn.util.basic import make_build_dir
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.util.exception import FINNInternalError
 from finn.util.logging import log
 
 finnxsi = xsi if xsi.is_available() else None
 
 
-class RTLBackend(ABC):
+class RTLBackend(HWCustomOp, ABC):
     """RTLBackend class all custom ops that correspond to a module in finn-rtllib
     are using functionality of. Contains different functions every RTL
     custom node should have. Some as abstract methods, these have to be filled
     when writing a new RTL custom op node."""
 
-    def get_nodeattr_types(self):
-        return {
-            # attribute to save top module name - not user configurable
-            "gen_top_module": ("s", False, ""),
-        }
+    def get_nodeattr_types(
+        self,
+    ) -> dict[
+        str,
+        tuple[str, bool, int | float | str | bool | npt.NDArray | list]
+        | tuple[str, bool, int | float | str | bool | npt.NDArray | list, set | None],
+    ]:
+        """Return 4-tuple (dtype, required, default_val, allowed_values) for attribute
+        with name. allowed_values will be None if not specified.
+
+        Returns:
+            dict[ str, tuple[str, bool, int | float | str | bool | npt.NDArray | list] | tuple[
+                str, bool, int | float | str | bool | npt.NDArray | list, set | None]]:
+                Dictionary of node attribute types
+        """
+        super_attrs = super().get_nodeattr_types()
+        super_attrs.update(
+            {
+                # attribute to save top module name - not user configurable
+                "gen_top_module": ("s", False, ""),
+            }
+        )
+        return super_attrs
 
     @abstractmethod
-    def generate_hdl(self, model, fpgapart, clk):
-        pass
+    def generate_hdl(self, model: "ModelWrapper", fpgapart: str, clk: str) -> None:
+        """Generate HDL code for this node.
+
+        Args:
+            model: The FINN model containing this node
+            fpgapart: Target FPGA part string
+            clk: Clock period specification
 
-    def prepare_rtlsim(self, behav=False):
-        """Creates a xsi emulation library for the RTL code generated
-        for this node, sets the rtlsim_so attribute to its path."""
+        Returns:
+            None
+        """
+
+    def prepare_rtlsim(self) -> None:
+        """Create a xsi emulation library for the RTL code generated for this node.
+        Sets the rtlsim_so attribute to the path of the generated library.
+
+        Returns:
+            None
+        """
+        import finn_xsi.adapter as finnxsi
 
         verilog_files = self.get_rtl_file_list(abspath=True)
         single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_")
         trace_file = self.get_nodeattr("rtlsim_trace")
         debug = not (trace_file is None or trace_file == "")
         ret = finnxsi.compile_sim_obj(
-            self.get_verilog_top_module_name(), verilog_files, single_src_dir, debug, behav
+            self.get_verilog_top_module_name(), verilog_files, single_src_dir, debug
         )
         # save generated lib filename in attribute
         self.set_nodeattr("rtlsim_so", ret[0] + "/" + ret[1])
 
-    def get_verilog_paths(self):
-        """Returns path to code gen directory. Can be overwritten to
-        return additional paths to relevant verilog files"""
+    def get_verilog_paths(self) -> list[str]:
+        """Return path to code gen directory.
+        Can be overwritten to return additional paths to relevant verilog files.
+
+        Returns:
+            list[str]: List of paths to directories containing Verilog files
+        """
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        return [code_gen_dir]
+        return [cast("str", code_gen_dir)]
 
     @abstractmethod
-    def get_rtl_file_list(self, abspath=False):
-        """Returns list of rtl files. Needs to be filled by each node."""
-        pass
+    def get_rtl_file_list(self, abspath: bool = False) -> list[str] | list[Path]:
+        """Return list of RTL files.
+        Must be implemented by each subclass to provide the list of RTL files used by this node.
+
+        Args:
+            abspath: If True, return absolute paths; if False, return relative paths
+
+        Returns:
+            list[str] | list[Path]: List of paths to RTL files
+        """
 
     @abstractmethod
-    def code_generation_ipi(self):
-        pass
+    def code_generation_ipi(self) -> list[str]:
+        """Generate TCL commands for IP Integrator.
+        Must be implemented by each subclass to provide the TCL commands needed
+        to integrate this node into Vivado IP Integrator.
+
+        Returns:
+            list[str]: List of TCL commands for IP Integrator
+        """
 
-    def code_generation_ipgen(self, model, fpgapart, clk):
+    def code_generation_ipgen(self, model: "ModelWrapper", fpgapart: str, clk: str) -> None:
+        """Generate HDL code for IP generation.
+        Wrapper method that calls generate_hdl to produce the HDL code for this node.
+
+        Args:
+            model: The FINN model containing this node
+            fpgapart: Target FPGA part string
+            clk: Clock period specification
+
+        Returns:
+            None
+        """
         self.generate_hdl(model, fpgapart, clk)
 
-    def execute_node(self, context, graph):
+    def execute_node(
+        self, context: dict[str, npt.NDArray], graph: "GraphProto"
+    ) -> None:  # noqa: ARG002
+        """Execute this node's RTL simulation.
+
+        Args:
+            context: Dictionary mapping tensor names to their numpy array values
+            graph: The ONNX graph containing this node
+
+        Returns:
+            None
+
+        Raises:
+            Exception: If exec_mode is not set to "rtlsim"
+        """
         mode = self.get_nodeattr("exec_mode")
-        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        code_gen_dir = cast("str", self.get_nodeattr("code_gen_dir_ipgen"))
 
         if mode == "rtlsim":
             node = self.onnx_node
             inputs = {}
             for i, inp in enumerate(node.input):
-                exp_ishape = tuple(self.get_normal_input_shape(i))
+                shape = self.get_normal_input_shape(i)
+                if shape is None:
+                    raise FINNInternalError(
+                        f"Input shape for input {i} of node {node.name} is None."
+                    )
+                exp_ishape = tuple(shape)
                 folded_ishape = self.get_folded_input_shape(i)
+                if folded_ishape is None:
+                    raise FINNInternalError(
+                        f"Folded input shape for input {i} of node {node.name} is None."
+                    )
                 inp_val = context[inp]
                 # Make sure the input has the right container datatype
                 if inp_val.dtype != np.float32:
@@ -112,15 +210,14 @@ def execute_node(self, context, graph):
                 export_idt = self.get_input_datatype(i)
 
                 reshaped_input = inp_val.reshape(folded_ishape)
-                np.save(os.path.join(code_gen_dir, "input_%s.npy" % i), reshaped_input)
+                input_path = Path(code_gen_dir) / f"input_{i}.npy"
+                np.save(input_path, reshaped_input)
                 nbits = self.get_instream_width(i)
-                rtlsim_inp = npy_to_rtlsim_input(
-                    "{}/input_{}.npy".format(code_gen_dir, i), export_idt, nbits
-                )
-                inputs["in%s" % i] = rtlsim_inp
+                rtlsim_inp = npy_to_rtlsim_input(str(input_path), export_idt, nbits)
+                inputs[f"in{i}"] = rtlsim_inp
             outputs = {}
-            for o, outp in enumerate(node.output):
-                outputs["out%s" % o] = []
+            for o, _ in enumerate(node.output):
+                outputs[f"out{o}"] = []
             # assembled execution context
             io_dict = {"inputs": inputs, "outputs": outputs}
 
@@ -129,17 +226,22 @@ def execute_node(self, context, graph):
             self.rtlsim_multi_io(sim, io_dict)
             self.close_rtlsim(sim)
             for o, outp in enumerate(node.output):
-                rtlsim_output = io_dict["outputs"]["out%s" % o]
+                rtlsim_output = io_dict["outputs"][f"out{o}"]
                 odt = self.get_output_datatype(o)
                 target_bits = odt.bitwidth()
                 packed_bits = self.get_outstream_width(o)
-                out_npy_path = "{}/output.npy".format(code_gen_dir)
+                out_npy_path = f"{code_gen_dir}/output.npy"
                 out_shape = self.get_folded_output_shape(o)
                 rtlsim_output_to_npy(
                     rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
                 )
                 # load and reshape output
-                exp_oshape = tuple(self.get_normal_output_shape(o))
+                oshape = self.get_normal_output_shape(o)
+                if oshape is None:
+                    raise FINNInternalError(
+                        f"Output shape for output {o} of node {node.name} is None."
+                    )
+                exp_oshape = tuple(oshape)
                 output = np.load(out_npy_path)
                 output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
                 context[outp] = output
@@ -150,8 +252,6 @@ def execute_node(self, context, graph):
 
         else:
             raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
+                f"""Invalid value for attribute exec_mode! Is currently set to: {mode}
+            has to be set to one of the following value ("cppsim", "rtlsim")"""
             )
diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py
index 026d5794e2..7eb2f4506e 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding.py
+++ b/src/finn/custom_op/fpgadataflow/thresholding.py
@@ -37,11 +37,12 @@
 from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions
 
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.custom_op.fpgadataflow.memstream import MemStreamSupport
 from finn.util.exception import FINNInternalError
 from finn.util.logging import log
 
 
-class Thresholding(HWCustomOp):
+class Thresholding(MemStreamSupport, HWCustomOp):
     """Abstraction layer for HW implementation of Thresholding."""
 
     def __init__(self, onnx_node, **kwargs):
diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
index ec31c68330..2bef9bfc65 100644
--- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
+++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py
@@ -48,12 +48,13 @@
 )
 
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.custom_op.fpgadataflow.memstream import MemStreamSupport
 from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string
 from finn.util.logging import log
 from finn.util.settings import get_settings
 
 
-class VVAU(HWCustomOp):
+class VVAU(MemStreamSupport, HWCustomOp):
     """Abstraction layer for HW implementation of VectorVectorActivation layers."""
 
     def __init__(self, onnx_node, **kwargs):
diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py
index 0e6145063b..85708f3d11 100644
--- a/src/finn/interface/run_finn.py
+++ b/src/finn/interface/run_finn.py
@@ -934,7 +934,7 @@ def bench(
     # Late import because we need prepare_finn to setup remaining dependencies first
     from finn.benchmarking.bench import start_bench_run
 
-    exit_code = start_bench_run(bench_config)
+    exit_code = start_bench_run(str(bench_config))
     sys.exit(exit_code)
 
 
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 52729919d6..2d170ee84b 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -1,3 +1,9 @@
+"""Create stitched IP from FINN dataflow graph.
+
+This module provides transformations to create a Vivado IP Block Design project
+from generated IPs in a FINN dataflow graph.
+"""
+
 # Copyright (c) 2020, Xilinx, Inc.
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
@@ -27,31 +33,37 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-"""Transformation to create stitched IP from dataflow graph components."""
-
 import json
 import multiprocessing as mp
 import os
+from pathlib import Path
+from qonnx.core.modelwrapper import ModelWrapper
 from qonnx.custom_op.registry import getCustomOp
 from qonnx.transformation.base import Transformation
 from qonnx.util.basic import get_num_default_workers
 from shutil import copytree
 from subprocess import CalledProcessError
+from typing import TYPE_CHECKING, Literal, cast
+
+if TYPE_CHECKING:
+    from onnx import NodeProto
 
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
 from finn.templates import get_templates_folder
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import ReplaceVerilogRelPaths
 from finn.util.basic import launch_process_helper, make_build_dir
-from finn.util.exception import FINNError, FINNUserError
+from finn.util.exception import FINNInternalError, FINNUserError
 from finn.util.fpgadataflow import is_hls_node, is_rtl_node
 from finn.util.logging import log
 
 
-def is_external_input(model, node, i):
-    """
-    Determine whether input i of node should be made external.
+def is_external_input(model: ModelWrapper, node: "NodeProto", i: int) -> bool:
+    """Check if input i of node should be made external.
 
-    True only if input is unconnected and has no initializer.
-    Only exception is second input of FC layers when mem_mode is external.
+    Returns True only if input is unconnected and has no initializer.
+    Exception: second input of FC layers when mem_mode is external.
     """
     node_inst = getCustomOp(node)
     op_type = node.op_type
@@ -59,21 +71,19 @@ def is_external_input(model, node, i):
     if producer is None:
         if model.get_initializer(node.input[i]) is None:
             return True
-        else:
-            if op_type.startswith("MVAU"):
-                if node_inst.get_nodeattr("mem_mode") == "external":
-                    return True
+        if op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") == "external":
+            return True
     return False
 
 
-def is_external_output(model, node, i):
-    """Determine whether output i of node should be made external."""
+def is_external_output(model: ModelWrapper, node: "NodeProto", i: int) -> bool:
+    """Check if output i of node should be made external.
+
+    Returns True only if output is unconnected.
+    """
+    # TODO should ideally check if tensor is in top-level outputs
     consumers = model.find_consumers(node.output[i])
-    if consumers == []:
-        # TODO should ideally check if tensor is in top-level
-        # outputs
-        return True
-    return False
+    return consumers == []
 
 
 class CreateStitchedIP(Transformation):
@@ -90,14 +100,34 @@ class CreateStitchedIP(Transformation):
     The packaged block design IP can be found under the ip subdirectory.
     """
 
-    def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signature=[]):
-        """Initialize CreateStitchedIP transformation with FPGA part and clock settings."""
+    def __init__(
+        self,
+        fpgapart: str,
+        clk_ns: float,
+        ip_name: str = "finn_design",
+        vitis: bool = False,
+        signature: list | None = None,
+        functional_simulation: bool = False,
+    ) -> None:
+        """Initialize CreateStitchedIP transformation.
+
+        Args:
+            fpgapart: FPGA part identifier
+            clk_ns: Clock period in nanoseconds
+            ip_name: Name for the IP design
+            vitis: Whether to target Vitis
+            signature: Optional signature list [customer, application, version]
+            functional_simulation: Whether to generate functional simulation wrapper
+        """
+        if signature is None:
+            signature = []
         super().__init__()
         self.fpgapart = fpgapart
         self.clk_ns = clk_ns
         self.ip_name = ip_name
         self.vitis = vitis
         self.signature = signature
+        self.functional_simulation = functional_simulation
         self.has_aximm = False
         self.has_m_axis = False
         self.m_axis_idx = 0
@@ -118,208 +148,221 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu
             "ap_none": [],
         }
 
-    def is_double_pumped(self, node):
-        """Check if node uses double pumped computation."""
+    def is_double_pumped(self, node: "NodeProto") -> bool:
+        """Check if node uses double-pumped compute or memory."""
         if node.op_type.startswith("MVAU"):
             inst = getCustomOp(node)
             try:
-                pumped_compute = inst.get_nodeattr("pumpedCompute")
+                pumped_compute = cast("int", inst.get_nodeattr("pumpedCompute"))
             except AttributeError:
                 pumped_compute = 0
-            return pumped_compute or inst.get_nodeattr("pumpedMemory")
+            return bool(pumped_compute or cast("int", inst.get_nodeattr("pumpedMemory")))
+        return False
 
-    def connect_clk_rst(self, node):
-        """Connect clock and reset signals for the node."""
+    def connect_clk_rst(self, node: "NodeProto") -> None:
+        """Connect clock and reset signals for a node."""
         inst_name = node.name
         node_inst = getCustomOp(node)
+        if not isinstance(node_inst, HWCustomOp):
+            raise FINNInternalError(
+                f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces."
+            )
         clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0]
         reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0]
+
         # make clock and reset external, if they aren't already
         if not self.clock_reset_are_external:
-            self.connect_cmds.append(
-                "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock_intf_name)
-            )
-            self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]")
-            self.connect_cmds.append(
-                "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, reset_intf_name)
+            self.connect_cmds.extend(
+                [
+                    f"make_bd_pins_external [get_bd_pins {inst_name}/{clock_intf_name}]",
+                    "set_property name ap_clk [get_bd_ports ap_clk_0]",
+                    f"make_bd_pins_external [get_bd_pins {inst_name}/{reset_intf_name}]",
+                    "set_property name ap_rst_n [get_bd_ports ap_rst_n_0]",
+                ]
             )
-            self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]")
             self.clock_reset_are_external = True
             self.intf_names["clk"] = ["ap_clk"]
             self.intf_names["rst"] = ["ap_rst_n"]
         # otherwise connect clock and reset
         else:
-            self.connect_cmds.append(
-                "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]"
-                % (inst_name, reset_intf_name)
-            )
-            self.connect_cmds.append(
-                "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]"
-                % (inst_name, clock_intf_name)
+            self.connect_cmds.extend(
+                [
+                    f"connect_bd_net [get_bd_ports ap_rst_n] "
+                    f"[get_bd_pins {inst_name}/{reset_intf_name}]",
+                    f"connect_bd_net [get_bd_ports ap_clk] "
+                    f"[get_bd_pins {inst_name}/{clock_intf_name}]",
+                ]
             )
+
         # make clk2x external, if it isn't already and connect clk2x
         if self.is_double_pumped(node):
             clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0]
             if not self.clock2x_is_external:
-                self.connect_cmds.append(
-                    "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name)
+                self.connect_cmds.extend(
+                    [
+                        f"make_bd_pins_external [get_bd_pins {inst_name}/{clock2x_intf_name}]",
+                        "set_property name ap_clk2x [get_bd_ports ap_clk2x_0]",
+                    ]
                 )
-                self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]")
                 self.clock2x_is_external = True
                 self.intf_names["clk2x"] = ["ap_clk2x"]
             # otherwise connect clk2x
             else:
                 if self.is_double_pumped(node):
                     self.connect_cmds.append(
-                        "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]"
-                        % (inst_name, clock2x_intf_name)
+                        f"connect_bd_net [get_bd_ports ap_clk2x] "
+                        f"[get_bd_pins {inst_name}/{clock2x_intf_name}]"
                     )
 
-    def connect_axi(self, node):
-        """Connect AXI interfaces for the node."""
+    def connect_axi(self, node: "NodeProto") -> None:
+        """Connect AXI-Lite and AXI-MM interfaces for a node."""
         inst_name = node.name
         node_inst = getCustomOp(node)
+        if not isinstance(node_inst, HWCustomOp):
+            raise FINNInternalError(
+                f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces."
+            )
         axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"]
         aximm_intf_name = node_inst.get_verilog_top_module_intf_names()["aximm"]
+
         if len(axilite_intf_name) != 0:
             self.connect_cmds.append(
-                "make_bd_intf_pins_external "
-                "[get_bd_intf_pins %s/%s]" % (inst_name, axilite_intf_name[0])
-            )
-            ext_if_name = "%s_%d" % (
-                axilite_intf_name[0],
-                len(self.intf_names["axilite"]),
+                f"make_bd_intf_pins_external [get_bd_intf_pins {inst_name}/{axilite_intf_name[0]}]"
             )
+            ext_if_name = f"{axilite_intf_name[0]}_{len(self.intf_names['axilite'])}"
             self.intf_names["axilite"].append(ext_if_name)
+
         if len(aximm_intf_name) != 0:
-            self.connect_cmds.append(
-                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
-                % (inst_name, aximm_intf_name[0][0])
-            )
-            ext_if_name = "m_axi_gmem%d" % (len(self.intf_names["aximm"]))
-            self.connect_cmds.append(
-                "set_property name %s [get_bd_intf_ports m_axi_gmem_0]" % ext_if_name
+            ext_if_name = f"m_axi_gmem{len(self.intf_names['aximm'])}"
+            seg_name = f"{inst_name}/Data_m_axi_gmem/SEG_{ext_if_name}_Reg"
+
+            self.connect_cmds.extend(
+                [
+                    f"make_bd_intf_pins_external "
+                    f"[get_bd_intf_pins {inst_name}/{aximm_intf_name[0][0]}]",
+                    f"set_property name {ext_if_name} [get_bd_intf_ports m_axi_gmem_0]",
+                    "assign_bd_address",
+                    f"set_property offset 0 [get_bd_addr_segs {{{seg_name}}}]",
+                    f"set_property range 4G [get_bd_addr_segs {{{seg_name}}}]",
+                ]
             )
-            self.connect_cmds.append("assign_bd_address")
-            seg_name = "%s/Data_m_axi_gmem/SEG_%s_Reg" % (inst_name, ext_if_name)
-            self.connect_cmds.append("set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name))
-            # TODO should propagate this information from the node instead of 4G
-            self.connect_cmds.append("set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name))
+
             self.intf_names["aximm"] = [(ext_if_name, aximm_intf_name[0][1])]
             self.has_aximm = True
 
-    def connect_m_axis_external(self, node, idx=None):
-        """Connect master AXI stream interfaces as external ports."""
+    def connect_m_axis_external(self, node: "NodeProto", idx: int | None = None) -> None:
+        """Make AXI Stream master interface(s) external."""
         inst_name = node.name
         node_inst = getCustomOp(node)
+        if not isinstance(node_inst, HWCustomOp):
+            raise FINNInternalError(
+                f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces."
+            )
         output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"]
+
         # make output axis external
         for i in range(len(output_intf_names)):
             if idx is not None and idx != i:
                 continue
             output_intf_name = output_intf_names[i][0]
-            self.connect_cmds.append(
-                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
-                % (inst_name, output_intf_name)
-            )
-            self.connect_cmds.append(
-                "set_property name m_axis_%d [get_bd_intf_ports %s_0]"
-                % (self.m_axis_idx, output_intf_name)
+
+            self.connect_cmds.extend(
+                [
+                    f"make_bd_intf_pins_external [get_bd_intf_pins {inst_name}/{output_intf_name}]",
+                    f"set_property name m_axis_{self.m_axis_idx} "
+                    f"[get_bd_intf_ports {output_intf_name}_0]",
+                ]
             )
+
             self.has_m_axis = True
-            self.intf_names["m_axis"].append(
-                ("m_axis_%d" % self.m_axis_idx, output_intf_names[i][1])
-            )
+            self.intf_names["m_axis"].append((f"m_axis_{self.m_axis_idx}", output_intf_names[i][1]))
             self.m_axis_idx += 1
 
-    def connect_s_axis_external(self, node, idx=None):
-        """Connect slave AXI stream interfaces as external ports."""
+    def connect_s_axis_external(self, node: "NodeProto", idx: int | None = None) -> None:
+        """Make AXI Stream slave interface(s) external."""
         inst_name = node.name
         node_inst = getCustomOp(node)
+        if not isinstance(node_inst, HWCustomOp):
+            raise FINNInternalError(
+                f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces."
+            )
         input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"]
+
         # make input axis external
         for i in range(len(input_intf_names)):
             if idx is not None and idx != i:
                 continue
             input_intf_name = input_intf_names[i][0]
-            self.connect_cmds.append(
-                "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" % (inst_name, input_intf_name)
-            )
-            self.connect_cmds.append(
-                "set_property name s_axis_%d [get_bd_intf_ports %s_0]"
-                % (self.s_axis_idx, input_intf_name)
+
+            self.connect_cmds.extend(
+                [
+                    f"make_bd_intf_pins_external [get_bd_intf_pins {inst_name}/{input_intf_name}]",
+                    f"set_property name s_axis_{self.s_axis_idx} "
+                    f"[get_bd_intf_ports {input_intf_name}_0]",
+                ]
             )
+
             self.has_s_axis = True
-            self.intf_names["s_axis"].append(
-                ("s_axis_%d" % self.s_axis_idx, input_intf_names[i][1])
-            )
+            self.intf_names["s_axis"].append((f"s_axis_{self.s_axis_idx}", input_intf_names[i][1]))
             self.s_axis_idx += 1
 
-    def connect_ap_none_external(self, node):
-        """Connect ap_none interfaces as external ports."""
+    def connect_ap_none_external(self, node: "NodeProto") -> None:
+        """Make ap_none interfaces external."""
         inst_name = node.name
         node_inst = getCustomOp(node)
+        if not isinstance(node_inst, HWCustomOp):
+            raise FINNInternalError(
+                f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces."
+            )
         input_intf_names = node_inst.get_verilog_top_module_intf_names()["ap_none"]
+
         # make external
         for i in range(len(input_intf_names)):
             input_intf_name = input_intf_names[i]
-            self.connect_cmds.append(
-                "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, input_intf_name)
-            )
-            self.connect_cmds.append(
-                "set_property name %s [get_bd_ports %s_0]" % (input_intf_name, input_intf_name)
+            self.connect_cmds.extend(
+                [
+                    f"make_bd_pins_external [get_bd_pins {inst_name}/{input_intf_name}]",
+                    f"set_property name {input_intf_name} [get_bd_ports {input_intf_name}_0]",
+                ]
             )
             self.intf_names["ap_none"].append(input_intf_name)
 
-    def insert_signature(self, checksum_count):
-        """Insert signature block for design identification."""
+    def insert_signature(self, checksum_count: int) -> None:
+        """Insert AXI info signature component into the design."""
         signature_vlnv = "AMD:user:axi_info_top:1.0"
         signature_name = "axi_info_top0"
-        self.create_cmds.append(
-            "create_bd_cell -type ip -vlnv %s %s" % (signature_vlnv, signature_name)
-        )
-        self.create_cmds.append(
-            "set_property -dict [list "
-            "CONFIG.SIG_CUSTOMER {%s} "
-            "CONFIG.SIG_APPLICATION {%s} "
-            "CONFIG.VERSION {%s} "
-            "CONFIG.CHECKSUM_COUNT {%s} "
-            "] [get_bd_cells %s]"
-            % (
-                self.signature[0],
-                self.signature[1],
-                self.signature[2],
-                checksum_count,
-                signature_name,
-            )
-        )
-        # set clk and reset
-        self.connect_cmds.append(
-            "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/ap_clk]" % signature_name
-        )
-        self.connect_cmds.append(
-            "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/ap_rst_n]" % signature_name
-        )
         fclk_mhz = 1 / (self.clk_ns * 0.001)
         fclk_hz = fclk_mhz * 1000000
-        self.connect_cmds.append(
-            "set_property -dict [list "
-            "CONFIG.FREQ_HZ {%f} "
-            "CONFIG.CLK_DOMAIN {ap_clk} "
-            "] [get_bd_intf_pins %s/s_axi]"
-            % (
-                fclk_hz,
-                signature_name,
-            )
+
+        # Create signature cell and configure properties
+        self.create_cmds.extend(
+            [
+                f"create_bd_cell -type ip -vlnv {signature_vlnv} {signature_name}",
+                f"set_property -dict [list "
+                f"CONFIG.SIG_CUSTOMER {{{self.signature[0]}}} "
+                f"CONFIG.SIG_APPLICATION {{{self.signature[1]}}} "
+                f"CONFIG.VERSION {{{self.signature[2]}}} "
+                f"CONFIG.CHECKSUM_COUNT {{{checksum_count}}} "
+                f"] [get_bd_cells {signature_name}]",
+            ]
         )
-        # make axilite interface external
-        self.connect_cmds.append(
-            "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name
+
+        # Connect clocks, resets and configure AXI interface
+        self.connect_cmds.extend(
+            [
+                f"connect_bd_net [get_bd_ports ap_clk] [get_bd_pins {signature_name}/ap_clk]",
+                f"connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins {signature_name}/ap_rst_n]",
+                f"set_property -dict [list "
+                f"CONFIG.FREQ_HZ {{{fclk_hz}}} "
+                f"CONFIG.CLK_DOMAIN {{ap_clk}} "
+                f"] [get_bd_intf_pins {signature_name}/s_axi]",
+                f"make_bd_intf_pins_external [get_bd_intf_pins {signature_name}/s_axi]",
+                "set_property name s_axilite_info [get_bd_intf_ports s_axi_0]",
+                "assign_bd_address",
+            ]
         )
-        self.connect_cmds.append("set_property name s_axilite_info [get_bd_intf_ports s_axi_0]")
-        self.connect_cmds.append("assign_bd_address")
 
-    def apply(self, model):
+    def apply(self, model: "ModelWrapper") -> tuple[ModelWrapper, Literal[False]]:
         """Apply the CreateStitchedIP transformation to the model."""
         # ensure non-relative readmemh .dat files
         model = model.transform(ReplaceVerilogRelPaths())
@@ -328,7 +371,10 @@ def apply(self, model):
         ip_dirs.append("$::env(FINN_RTLLIB)/memstream")
         if self.signature:
             ip_dirs.append("$::env(FINN_RTLLIB)/axi_info")
-        if model.graph.node[0].op_type not in ["StreamingFIFO_rtl", "IODMA_hls"]:
+        if (
+            model.graph.node[0].op_type not in ["StreamingFIFO_rtl", "IODMA_hls"]
+            and self.functional_simulation is False
+        ):
             log.warning(
                 """First node is not StreamingFIFO or IODMA.
                 You may experience incorrect stitched-IP rtlsim or hardware
@@ -345,12 +391,23 @@ def apply(self, model):
                 )
         for node in model.graph.node:
             # ensure that all nodes are fpgadataflow, and that IPs are generated
-            assert is_hls_node(node) or is_rtl_node(
-                node
-            ), "All nodes must be FINN fpgadataflow nodes."
+            if not is_hls_node(node) and not is_rtl_node(node):
+                raise FINNUserError(
+                    f"{node.name} is not an fpgadataflow node. Aborting stitching IP."
+                )
             node_inst = getCustomOp(node)
+            if not isinstance(node_inst, RTLBackend) and not isinstance(node_inst, HLSBackend):
+                raise FINNInternalError(
+                    f"Node {node.name} is not an RTL Node or HLS Node, "
+                    "cannot connect AXI interfaces."
+                )
             ip_dir_value = node_inst.get_nodeattr("ip_path")
-            assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist."
+            if type(ip_dir_value) is not str or ip_dir_value == "":
+                raise FINNInternalError(f"ip_path has the wrong type in node {node.name}.")
+            if not Path(ip_dir_value).is_dir():
+                raise FINNInternalError(
+                    f"IP generation directory doesn't exist in node {node.name}."
+                )
             ip_dirs += [ip_dir_value]
             self.create_cmds += node_inst.code_generation_ipi()
             self.connect_clk_rst(node)
@@ -362,22 +419,25 @@ def apply(self, model):
                     if producer is None:
                         continue
                     j = list(producer.output).index(node.input[i])
-                    src_intf_name = getCustomOp(producer).get_verilog_top_module_intf_names()[
-                        "m_axis"
-                    ][j][0]
+                    prod = getCustomOp(producer)
+                    if not isinstance(prod, HWCustomOp):
+                        raise FINNInternalError(
+                            f"Producer node {producer.name} is not an HWCustomOp, "
+                            "cannot connect AXI interfaces."
+                        )
+                    src_intf_name = prod.get_verilog_top_module_intf_names()["m_axis"][j][0]
                     dst_intf_name = node_inst.get_verilog_top_module_intf_names()["s_axis"][i][0]
                     self.connect_cmds.append(
-                        "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
-                        "[get_bd_intf_pins %s/%s]"
-                        % (producer.name, src_intf_name, node.name, dst_intf_name)
+                        f"connect_bd_intf_net [get_bd_intf_pins {producer.name}/{src_intf_name}] "
+                        f"[get_bd_intf_pins {node.name}/{dst_intf_name}]"
                     )
 
         # process external inputs and outputs in top-level graph input order
-        for input in model.graph.input:
-            inp_name = input.name
+        for graph_input in model.graph.input:
+            inp_name = graph_input.name
             inp_cons = model.find_consumers(inp_name)
-            assert inp_cons != [], "No consumer for input " + inp_name
-            assert len(inp_cons) == 1, "Multiple consumers for input " + inp_name
+            assert inp_cons != [], f"No consumer for input {inp_name}"
+            assert len(inp_cons) == 1, f"Multiple consumers for input {inp_name}"
             node = inp_cons[0]
             node_inst = getCustomOp(node)
             for i in range(len(node.input)):
@@ -386,7 +446,7 @@ def apply(self, model):
         for output in model.graph.output:
             out_name = output.name
             node = model.find_producer(out_name)
-            assert node is not None, "No producer for output " + out_name
+            assert node is not None, f"No producer for output {out_name}"
             node_inst = getCustomOp(node)
             for i in range(len(node.output)):
                 if node.output[i] == out_name:
@@ -403,147 +463,171 @@ def apply(self, model):
         model.set_metadata_prop("vivado_stitch_proj", vivado_stitch_proj_dir)
         # start building the tcl script
         tcl = []
-        # create vivado project
-        tcl.append(
-            "create_project %s %s -part %s" % (prjname, vivado_stitch_proj_dir, self.fpgapart)
-        )
-        # no warnings on long module names
-        tcl.append("set_msg_config -id {[BD 41-1753]} -suppress")
-        # add all the generated IP dirs to ip_repo_paths
+
+        # Project setup
         ip_dirs_str = " ".join(ip_dirs)
-        tcl.append("set_property ip_repo_paths [%s] [current_project]" % ip_dirs_str)
-        tcl.append("update_ip_catalog")
-        # create block design and instantiate all layers
         block_name = self.ip_name
-        tcl.append('create_bd_design "%s"' % block_name)
+
+        tcl.extend(
+            [
+                f"create_project {prjname} {vivado_stitch_proj_dir} -part {self.fpgapart}",
+                "set_msg_config -id {[BD 41-1753]} -suppress",
+                f"set_property ip_repo_paths [{ip_dirs_str}] [current_project]",
+                "update_ip_catalog",
+                f'create_bd_design "{block_name}"',
+            ]
+        )
+        # Add commands and validate design
         tcl.extend(self.create_cmds)
         tcl.extend(self.connect_cmds)
+
         fclk_mhz = 1 / (self.clk_ns * 0.001)
         fclk_hz = fclk_mhz * 1000000
-        tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz))
+
+        # Configure clocks and validate design
+        clock_config = [f"set_property CONFIG.FREQ_HZ {round(fclk_hz)} [get_bd_ports /ap_clk]"]
         if self.clock2x_is_external:
-            tcl.append(
-                "set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2 * fclk_hz)
+            clock_config.append(
+                f"set_property CONFIG.FREQ_HZ {round(2 * fclk_hz)} [get_bd_ports /ap_clk2x]"
             )
-        tcl.append("validate_bd_design")
-        tcl.append("save_bd_design")
-        # create wrapper hdl (for rtlsim later on)
-        bd_base = "%s/%s.srcs/sources_1/bd/%s" % (
-            vivado_stitch_proj_dir,
-            prjname,
-            block_name,
+
+        clock_config.extend(["validate_bd_design", "save_bd_design"])
+
+        tcl.extend(clock_config)
+
+        # Create wrapper HDL
+        bd_base = f"{vivado_stitch_proj_dir}/{prjname}.srcs/sources_1/bd/{block_name}"
+        bd_filename = f"{bd_base}/{block_name}.bd"
+        wrapper_filename = f"{bd_base}/hdl/{block_name}_wrapper.v"
+
+        tcl.extend(
+            [
+                f"make_wrapper -files [get_files {bd_filename}] -top",
+                f"add_files -norecurse {wrapper_filename}",
+                f"set_property top {block_name}_wrapper [current_fileset]",
+            ]
         )
-        bd_filename = "%s/%s.bd" % (bd_base, block_name)
-        tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename)
-        wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
-        tcl.append("add_files -norecurse %s" % wrapper_filename)
+
         model.set_metadata_prop("wrapper_filename", wrapper_filename)
-        tcl.append("set_property top %s_wrapper [current_fileset]" % block_name)
-        # synthesize to DCP and export stub, DCP and constraints
+        num_workers = get_num_default_workers()
+        assert num_workers >= 0, "Number of workers must be nonnegative."
+        if num_workers == 0:
+            num_workers = mp.cpu_count()
+
+        fifosim_wrapper_filename = None
+        if self.functional_simulation:
+            bd_base_sim = f"{vivado_stitch_proj_dir}/{prjname}.sim/sim_1/synth/func/xsim/"
+            fifosim_wrapper_filename = f"{bd_base_sim}/fifosim_wrapper_func_synth.v"
+
+            tcl.extend(
+                [
+                    f"launch_runs synth_1 -jobs {num_workers}",
+                    "wait_on_run [get_runs synth_1]",
+                    "open_run synth_1 -name synth_1",
+                    "opt_design",
+                    # "opt_design -muxf_remap -carry_remap -control_set_merge "
+                    # "-merge_equivalent_drivers -mbufg_opt -dsp_register_opt "
+                    # "-control_set_opt -remap -resynth_area -resynth_remap",
+                    # "opt_design",
+                    f"write_verilog -mode funcsim -force -file {fifosim_wrapper_filename}",
+                ]
+            )
+
+            model.set_metadata_prop("wrapper_filename", fifosim_wrapper_filename)
+        # Synthesize to DCP and export stub, DCP and constraints
         if self.vitis:
-            tcl.append(
-                "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]" % bd_filename
-            )
-            tcl.append(
-                "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} "
-                "-value {-mode out_of_context} -objects [get_runs synth_1]"
-            )
-            num_workers = get_num_default_workers()
-            assert num_workers >= 0, "Number of workers must be nonnegative."
-            if num_workers == 0:
-                num_workers = mp.cpu_count()
-            tcl.append("launch_runs synth_1 -jobs %s" % str(num_workers))
-            tcl.append("wait_on_run [get_runs synth_1]")
-            tcl.append("open_run synth_1 -name synth_1")
-            tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name)
-            tcl.append("write_checkpoint %s.dcp" % block_name)
-            tcl.append("write_xdc %s.xdc" % block_name)
-            tcl.append(
-                "report_utilization -hierarchical -hierarchical_depth 5 "
-                "-file %s_partition_util.rpt" % block_name
-            )
-        # export block design itself as an IP core
+            tcl.extend(
+                [
+                    f"set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files {bd_filename} ]",
+                    "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} "
+                    "-value {-mode out_of_context} -objects [get_runs synth_1]",
+                    f"launch_runs synth_1 -jobs {num_workers}",
+                    "wait_on_run [get_runs synth_1]",
+                    "open_run synth_1 -name synth_1",
+                    f"write_verilog -force -mode synth_stub {block_name}.v",
+                    f"write_checkpoint {block_name}.dcp",
+                    f"write_xdc {block_name}.xdc",
+                    f"report_utilization -hierarchical -hierarchical_depth 5 "
+                    f"-file {block_name}_partition_util.rpt",
+                ]
+            )
+        # Export block design itself as an IP core
         block_vendor = "xilinx_finn"
         block_library = "finn"
-        block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
+        block_vlnv = f"{block_vendor}:{block_library}:{block_name}:1.0"
         model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
         model.set_metadata_prop("vivado_stitch_ifnames", json.dumps(self.intf_names))
-        tcl.append(
-            (
-                "ipx::package_project -root_dir %s/ip -vendor %s "
-                "-library %s -taxonomy /UserIP -module %s -import_files"
-            )
-            % (vivado_stitch_proj_dir, block_vendor, block_library, block_name)
-        )
-        # Allow user to customize clock in deployment of stitched IP
-        tcl.append("set_property ipi_drc {ignore_freq_hz true} [ipx::current_core]")
-        # in some cases, the IP packager seems to infer an aperture of 64K or 4G,
-        # preventing address assignment of the DDR_LOW and/or DDR_HIGH segments
-        # the following is a hotfix to remove this aperture during IODMA packaging
-        tcl.append(
-            "ipx::remove_segment -quiet m_axi_gmem0:APERTURE_0 "
-            "[ipx::get_address_spaces m_axi_gmem0 -of_objects [ipx::current_core]]"
-        )
-        tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" % block_vlnv)
-        tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" % block_vlnv)
-        # mark bus interface params as user-resolvable to avoid FREQ_MHZ mismatches
-        tcl.append(
-            "set_property value_resolve_type user [ipx::get_bus_parameters "
-            "-of [ipx::get_bus_interfaces -of [ipx::current_core ]]]"
+
+        # Package IP and configure properties
+        tcl.extend(
+            [
+                f"ipx::package_project -root_dir {vivado_stitch_proj_dir}/ip "
+                f"-vendor {block_vendor} -library {block_library} -taxonomy /UserIP "
+                f"-module {block_name} -import_files",
+                "set_property ipi_drc {ignore_freq_hz true} [ipx::current_core]",
+                "ipx::remove_segment -quiet m_axi_gmem0:APERTURE_0 "
+                "[ipx::get_address_spaces m_axi_gmem0 -of_objects [ipx::current_core]]",
+                f"set_property core_revision 2 [ipx::find_open_core {block_vlnv}]",
+                f"ipx::create_xgui_files [ipx::find_open_core {block_vlnv}]",
+                "set_property value_resolve_type user [ipx::get_bus_parameters "
+                "-of [ipx::get_bus_interfaces -of [ipx::current_core ]]]",
+            ]
         )
-        # if targeting Vitis, add some properties to the IP
+        # If targeting Vitis, add some properties to the IP
         if self.vitis:
-            # replace source code with dcp
-            tcl.append("set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv)
-            tcl.append("set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv)
-            tcl.append("set_property supported_families { } [ipx::find_open_core %s]" % block_vlnv)
-            tcl.append(
-                "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} "
-                "[ipx::find_open_core %s]" % block_vlnv
-            )
-            tcl.append(
-                "set_property auto_family_support_level level_2 "
-                "[ipx::find_open_core %s]" % block_vlnv
-            )
-            # remove all files from synthesis and sim groups
-            # we'll replace with DCP, stub, and xdc
-            tcl.append(
-                "ipx::remove_all_file "
-                "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]"
-            )
-            tcl.append("ipx::remove_all_file " "[ipx::get_file_groups xilinx_anylanguagesynthesis]")
-            tcl.append(
-                "ipx::remove_file_group "
-                "xilinx_anylanguagebehavioralsimulation [ipx::current_core]"
-            )
-            tcl.append("ipx::remove_file_group " "xilinx_anylanguagesynthesis [ipx::current_core]")
-            # remove sim and src folders
-            tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir)
-            tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir)
-            # copy and add DCP, stub, and xdc
-            tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir)
-            tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir)
-            tcl.append("file copy -force %s.dcp %s/ip/dcp" % (block_name, vivado_stitch_proj_dir))
-            tcl.append("file copy -force %s.xdc %s/ip/impl" % (block_name, vivado_stitch_proj_dir))
-            tcl.append("ipx::add_file_group xilinx_implementation [ipx::current_core]")
-            tcl.append(
-                "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]"
-                % block_name
-            )
-            tcl.append(
-                "set_property used_in [list implementation] "
-                "[ipx::get_files impl/%s.xdc "
-                "-of_objects [ipx::get_file_groups xilinx_implementation]]" % block_name
-            )
-            tcl.append("ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]")
-            tcl.append(
-                "ipx::add_file dcp/%s.dcp "
-                "[ipx::get_file_groups xilinx_synthesischeckpoint]" % block_name
-            )
-            tcl.append("ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]")
-            tcl.append(
-                "ipx::add_file dcp/%s.dcp "
-                "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name
+            # Configure Vitis kernel properties
+            tcl.extend(
+                [
+                    f"set_property sdx_kernel true [ipx::find_open_core {block_vlnv}]",
+                    f"set_property sdx_kernel_type rtl [ipx::find_open_core {block_vlnv}]",
+                    f"set_property supported_families {{}} [ipx::find_open_core {block_vlnv}]",
+                    f"set_property xpm_libraries {{XPM_CDC XPM_MEMORY XPM_FIFO}} "
+                    f"[ipx::find_open_core {block_vlnv}]",
+                    f"set_property auto_family_support_level level_2 "
+                    f"[ipx::find_open_core {block_vlnv}]",
+                ]
+            )
+
+            # Remove all files from synthesis and sim groups and replace with DCP
+            tcl.extend(
+                [
+                    "ipx::remove_all_file "
+                    "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]",
+                    "ipx::remove_all_file [ipx::get_file_groups xilinx_anylanguagesynthesis]",
+                    "ipx::remove_file_group "
+                    "xilinx_anylanguagebehavioralsimulation [ipx::current_core]",
+                    "ipx::remove_file_group xilinx_anylanguagesynthesis [ipx::current_core]",
+                ]
+            )
+
+            # Setup file structure for DCP-based IP
+            tcl.extend(
+                [
+                    f"file delete -force {vivado_stitch_proj_dir}/ip/sim",
+                    f"file delete -force {vivado_stitch_proj_dir}/ip/src",
+                    f"file mkdir {vivado_stitch_proj_dir}/ip/dcp",
+                    f"file mkdir {vivado_stitch_proj_dir}/ip/impl",
+                    f"file copy -force {block_name}.dcp {vivado_stitch_proj_dir}/ip/dcp",
+                    f"file copy -force {block_name}.xdc {vivado_stitch_proj_dir}/ip/impl",
+                ]
+            )
+
+            # Add implementation and checkpoint file groups
+            tcl.extend(
+                [
+                    "ipx::add_file_group xilinx_implementation [ipx::current_core]",
+                    f"ipx::add_file impl/{block_name}.xdc "
+                    "[ipx::get_file_groups xilinx_implementation]",
+                    f"set_property used_in [list implementation] "
+                    f"[ipx::get_files impl/{block_name}.xdc "
+                    f"-of_objects [ipx::get_file_groups xilinx_implementation]]",
+                    "ipx::add_file_group xilinx_synthesischeckpoint [ipx::current_core]",
+                    f"ipx::add_file dcp/{block_name}.dcp "
+                    f"[ipx::get_file_groups xilinx_synthesischeckpoint]",
+                    "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]",
+                    f"ipx::add_file dcp/{block_name}.dcp "
+                    f"[ipx::get_file_groups xilinx_simulationcheckpoint]",
+                ]
             )
         # add a rudimentary driver mdd to get correct ranges in xparameters.h later on
         min_driver = get_templates_folder() / "ipcore_driver"
@@ -626,30 +710,30 @@ def apply(self, model):
 """
         )
 
-        # export list of used Verilog files (for rtlsim later on)
-        tcl.append(
-            "set all_v_files [get_files -filter {USED_IN_SYNTHESIS == 1 "
-            + "&& (FILE_TYPE == Verilog || FILE_TYPE == SystemVerilog "
-            + '|| FILE_TYPE =="Verilog Header")}]'
+        # Export list of used Verilog files (for rtlsim later on)
+        v_file_list = f"{vivado_stitch_proj_dir}/all_verilog_srcs.txt"
+        tcl.extend(
+            [
+                "set all_v_files [get_files -filter {USED_IN_SYNTHESIS == 1 "
+                "&& (FILE_TYPE == Verilog || FILE_TYPE == SystemVerilog "
+                '|| FILE_TYPE =="Verilog Header")}]',
+                f"set fp [open {v_file_list} w]",
+                "foreach vf $all_v_files {puts $fp $vf}",
+                "close $fp",
+            ]
         )
-        v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir
-        tcl.append("set fp [open %s w]" % v_file_list)
-        # write each verilog filename to all_verilog_srcs.txt
-        tcl.append("foreach vf $all_v_files {puts $fp $vf}")
-        tcl.append("close $fp")
         # write the project creator tcl script
         tcl_string = "\n".join(tcl) + "\n"
-        with open(vivado_stitch_proj_dir + "/make_project.tcl", "w") as f:
+        with Path(f"{vivado_stitch_proj_dir}/make_project.tcl").open("w") as f:
             f.write(tcl_string)
         # create a shell script and call Vivado
-        make_project_sh = vivado_stitch_proj_dir + "/make_project.sh"
-        working_dir = os.getcwd()
-        with open(make_project_sh, "w") as f:
+        make_project_sh = f"{vivado_stitch_proj_dir}/make_project.sh"
+        working_dir = Path.cwd()
+        with Path(make_project_sh).open("w") as f:
             f.write("#!/bin/bash \n")
-            f.write("cd {}\n".format(vivado_stitch_proj_dir))
-            f.write("set -e\n")  # Exit with non-zero if vivado fails.
+            f.write(f"cd {vivado_stitch_proj_dir}\n")
             f.write("vivado -mode batch -source make_project.tcl\n")
-            f.write("cd {}\n".format(working_dir))
+            f.write(f"cd {working_dir}\n")
         bash_command = ["bash", make_project_sh]
 
         try:
@@ -661,17 +745,22 @@ def apply(self, model):
                 f"{vivado_stitch_proj_dir} to find out why it failed."
             ) from e
 
+        if self.functional_simulation:
+            with Path(v_file_list).open("a") as f:
+                f.write(f"{fifosim_wrapper_filename}\n")
+
         # wrapper may be created in different location depending on Vivado version
-        if not os.path.isfile(wrapper_filename):
+        if not Path(wrapper_filename).is_file():
             # check in alternative location (.gen instead of .srcs)
             wrapper_filename_alt = wrapper_filename.replace(".srcs", ".gen")
-            if os.path.isfile(wrapper_filename_alt):
-                model.set_metadata_prop("wrapper_filename", wrapper_filename_alt)
+            if Path(wrapper_filename_alt).is_file():
+                if not self.functional_simulation:
+                    model.set_metadata_prop("wrapper_filename", wrapper_filename_alt)
             else:
-                raise FINNError(
-                    """CreateStitchedIP failed, no wrapper HDL found under %s or %s.
+                raise FINNUserError(
+                    f"""CreateStitchedIP failed, no wrapper HDL found \
+                        under {wrapper_filename} or {wrapper_filename_alt}.
                     Please check logs under the parent directory."""
-                    % (wrapper_filename, wrapper_filename_alt)
                 )
 
         return (model, False)
diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py
index ebc60d31d6..8b390eaa00 100644
--- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py
+++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py
@@ -71,7 +71,7 @@ def applyNodeLocal(self, node):
                     # call the compilation function for this node
                     inst.ipgen_singlenode_code()
                 else:
-                    log.info(f"Using pre-existing IP for {node.name}")
+                    log.debug(f"Using pre-existing IP for {node.name}")
                 # ensure that executable path is now set
                 assert (
                     inst.get_nodeattr("ipgen_path") != ""
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index 94f2c29605..653d863283 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -133,6 +133,8 @@ def apply(self, model):
 
                         # check if outFIFOdepths attribute of first node
                         # and inFIFOdepths attribute of consumer node is equal
+                        idx_out = min(idx_out, len(n0.get_nodeattr("outFIFODepths")) - 1)
+                        idx_inp = min(idx_inp, len(n1.get_nodeattr("inFIFODepths")) - 1)
                         n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out]
                         n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp]
 
diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py
index bc1d66c71f..4a2f8997a8 100644
--- a/src/finn/transformation/fpgadataflow/make_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_driver.py
@@ -108,7 +108,7 @@ def resolve_dt_name(s: str) -> str:
         if s in ["BINARY", "TERNARY", "BIPOLAR"]:
             return "Datatype" + s[0] + s[1:].lower()
         elif s.startswith("U"):
-            return "DatatypeUint<" + s.replace("UINT", "") + ">"
+            return "DatatypeUInt<" + s.replace("UINT", "") + ">"
         elif s.startswith("I"):
             return "DatatypeInt<" + s.replace("INT", "") + ">"
         elif "FLOAT" in s:
diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py
index a60c8b6b49..57539c9afc 100644
--- a/src/finn/transformation/fpgadataflow/prepare_ip.py
+++ b/src/finn/transformation/fpgadataflow/prepare_ip.py
@@ -53,7 +53,7 @@ def _codegen_single_node(node, model, fpgapart, clk):
             # ensure that there is generated code inside the dir
             inst.code_generation_ipgen(model, fpgapart, clk)
         else:
-            log.info(f"Using pre-existing code for {node.name}")
+            log.debug(f"Using pre-existing code for {node.name}")
     except KeyError:
         # exception if op_type is not supported
         raise Exception(f"Custom op_type {op_type} is currently not supported.")
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index a87c00c802..a717354cb3 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -220,11 +220,15 @@ def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, Literal[False]]:
 
 
 def xsi_fifosim(
-    model: ModelWrapper,
-    n_inferences: int,
-    max_iters: float | None = None,
-    throttle_cycles: int = 0,
-) -> dict[str, int]:
+    model,
+    n_inferences,
+    is_single_node,
+    total_nodes: int = 1,
+    current_node_index: int | None = None,
+    previous_node_name: str | None = None,
+    max_iters=None,
+    throttle_cycles=0,
+):
     """Create a XSI model of stitched IP and use a simple C++
     driver to drive the input stream. Useful for FIFO sizing, latency
     and throughput measurement. If max_iters is None, use the default
@@ -243,6 +247,10 @@ def xsi_fifosim(
     ret_dict = rtlsim_exec_cppxsi(
         model,
         ctx,
+        is_single_node,
+        total_nodes=total_nodes,
+        current_node_index=current_node_index,
+        previous_node_name=previous_node_name,
         dummy_data_mode=True,
         timeout_cycles=max_iters,
         throttle_cycles=throttle_cycles,
@@ -405,7 +413,11 @@ def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, Literal[False]]:
             throttle_cycles = 0
 
         sim = xsi_fifosim(
-            model, self.cfg_n_inferences, max_iters=max_iters, throttle_cycles=int(throttle_cycles)
+            model,
+            self.cfg_n_inferences,
+            False,
+            max_iters=max_iters,
+            throttle_cycles=int(throttle_cycles),
         )
 
         for ind, node in enumerate(fifo_nodes):
@@ -451,9 +463,9 @@ def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, Literal[False]]:
                     reset_implementation(node_inst)
                     modified_fc_nodes.remove(node.name)
 
-        assert (
-            len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
-        ), "FIFO/FC nodes left untouched after model reconfiguration"
+        assert len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0, (
+            "FIFO/FC nodes left untouched after model reconfiguration"
+        )
 
         # handle custom sizing for SWG FIFOs if desired
         if self.swg_exception:
@@ -608,6 +620,10 @@ def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, Literal[False]]:
                     dtype = cast("str", n_inst.get_nodeattr("dataType"))
                     ram_style = n_inst.get_nodeattr("ram_style")
                     shape = model.get_tensor_shape(node.input[0])
+                    log.info(
+                        f"Splitting FIFO {node.name} of depth {depth} "
+                        f"into {len(cfgs)} FIFOs with depths {[c[0] for c in cfgs]}"
+                    )
                     for i, (fifo_depth, impl_style) in enumerate(cfgs):
                         inp = node.input[0] if i == 0 else node.name + "_" + str(i - 1) + "_out"
                         if i == len(cfgs) - 1:
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 6a23727bb3..dc069e9462 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -265,7 +265,7 @@ def apply(self, model):
                     node_inst.set_nodeattr("SIMD", 1)
                     channels_per_stream = node_inst.get_nodeattr("ChannelsPerStream")
                     for simd_val in common_divisors(channels_per_stream):
-                        node_inst.set_nodeattr("SIMD", simd_val)
+                        node_inst.set_nodeattr("SIMD", int(simd_val))
                         cyc = node_inst.get_exp_cycles()
                         if cyc < self.target_cycles_per_frame:
                             break
@@ -274,7 +274,7 @@ def apply(self, model):
                     dim = int(node_inst.get_normal_input_shape()[-1])
                     for simd_val in divisors(dim):
                         if dim // simd_val > 12:
-                            node_inst.set_nodeattr("SIMD", simd_val)
+                            node_inst.set_nodeattr("SIMD", int(simd_val))
                             cyc = node_inst.get_exp_cycles()
                             if cyc < self.target_cycles_per_frame:
                                 break
diff --git a/src/finn/transformation/fpgadataflow/simulation.py b/src/finn/transformation/fpgadataflow/simulation.py
new file mode 100644
index 0000000000..e0d57197b4
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/simulation.py
@@ -0,0 +1,278 @@
+"""Manages the Simulation superclass as well as general simulation related transforms."""
+
+import json
+import pandas as pd
+from pathlib import Path
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.base import Transformation
+from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from typing import Any, TypeAlias, cast
+
+from finn.builder.build_dataflow_config import DataflowBuildConfig
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.simulation_build import BuildSimulation, SimulationType
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.util.exception import FINNInternalError, FINNUserError
+from finn.util.logging import log
+
+FIFODepthConfig: TypeAlias = list[dict[str, list[int]]]
+
+
+def store_fifo_data(
+    model: ModelWrapper,
+    data: pd.DataFrame,
+    default_path: Path,
+    delete_existing: bool,
+    sort_on: str = "onnx_index",
+    merge_on: list[str] | None = None,
+    merge_how: str = "inner",
+    store_html: bool = True,
+) -> ModelWrapper:
+    """Store the given dataframe in a CSV file.
+
+    If the model already points to data, merge with it and store at the
+    path used before (unless delete_existing=True, then simply overwrite at that same path).
+    If no data is stored beforehand, use the `default_path` and simply store
+    the data there. The path is then entered into the `"fifo_data_path"` metadata prop of the model.
+
+    The function can be used to aggregate benchmarking data across several flow steps.
+
+    Args:
+        model: The model that we check for a path to existing FIFO data.
+        data: The data to store.
+        default_path: Path to use in case that the model doesn't reference a data file yet.
+           Is then stored as a metadata prop in the model.
+        delete_existing: If true, delete the table and start a new one.
+        sort_on: The column to sort on after merging.
+        merge_on: What columns to merge on. If "None", use `["onnx_index", "node", "stream"]`
+        merge_how: How to merge. Forwarded to pd.merge().
+        store_html: If True, also store the data as a HTML with the same name next to the CSV.
+
+    Returns:
+        model: Return the model since we might have modified its metadata.
+    """
+    # Check if all layers are accounted for
+    # Note: data may have multiple rows per node (one per output stream)
+    if "node" in data.columns:
+        num_unique_nodes = len(data["node"].unique())
+        if num_unique_nodes != len(model.graph.node):
+            raise FINNInternalError(
+                f"Tried storing FIFO data for {num_unique_nodes} unique nodes "
+                f"but expected {len(model.graph.node)}"
+            )
+    elif len(data.index) != len(model.graph.node):
+        raise FINNInternalError(
+            f"Tried storing FIFO data for {len(data.index)} "
+            f"values but expected {len(model.graph.node)}"
+        )
+    fifo_data_path = model.get_metadata_prop("fifo_data_path")
+    if fifo_data_path is not None:
+        if not fifo_data_path.endswith(".csv"):
+            raise FINNInternalError(
+                f"It seems the model saved path to store "
+                f"the dataframe does not point to a csv file: {fifo_data_path}"
+            )
+        if delete_existing:
+            Path(fifo_data_path).unlink(missing_ok=True)
+            merged = data
+        else:
+            merged = pd.merge(
+                data, pd.read_csv(fifo_data_path), on=merge_on, how=merge_how  # type: ignore
+            )
+            merged = merged.sort_values(sort_on)
+        merged.to_csv(fifo_data_path, index=False)
+        if store_html:
+            merged.to_html(fifo_data_path.replace(".csv", ".html"))
+        log.info(f"Stored FIFO dataframe to {fifo_data_path}.")
+    else:
+        if not default_path.suffix == ".csv":
+            raise FINNInternalError(
+                f"It seems the provided default path to store "
+                f"the dataframe does not point to a csv file: {fifo_data_path}"
+            )
+        if delete_existing:
+            default_path.unlink(missing_ok=True)
+        data.to_csv(default_path, index=False)
+        if store_html:
+            data.to_html(str(default_path).replace(".csv", ".html"))
+        model.set_metadata_prop("fifo_data_path", str(default_path))
+        log.info(f"Stored FIFO dataframe to {default_path}.")
+    return model
+
+
+class Simulation:
+    """Manage simulation (runs) in FINN. Upon instance creation, the simulation will be built.
+    Simulations should inherit from this class and expand for their specific needs.
+
+    IMPORTANT: If the modelwrapper was somehow changed, create a NEW simulation object!
+    """
+
+    def __init__(
+        self,
+        model: ModelWrapper,
+        simulation_type: SimulationType,
+        fpgapart: str,
+        clk_ns: float,
+        functional_sim: bool,
+        workers: int | None = None,
+    ) -> None:
+        """Create a new simulation instance. Read simulation binary paths
+        from the simulation_binaries metadata prop field."""
+        self.simulation_type = simulation_type
+        self.model = model
+        sim_binaries = self.model.get_metadata_prop("simulation_binaries")
+
+        if sim_binaries is None:
+            raise FINNUserError(
+                "No field simulation_binaries found in the model. Make "
+                "sure to run the BuildSimulation transformation beforehand."
+            )
+        sim_binaries: list[Path] = [Path(p) for p in str(sim_binaries).split("\n")]
+        if len(sim_binaries) != len(self.model.graph.node):
+            raise FINNUserError(
+                "The number of found simulation binaries does not match the number "
+                "of nodes in the graph. Make sure to run BuildSimulation just "
+                "before."
+            )
+        if any(not p.exists() for p in sim_binaries):
+            raise FINNUserError(
+                "Simulation binary data points to invalid paths. Please rerun BuildSimulation."
+            )
+        # TODO: Currently we have to recompile even if we just
+        # TODO: called BuildSimulation in the step before
+        # (However this only compiles, it should NOT stitch the IPs again)
+        self.model = self.model.transform(BuildSimulation(fpgapart, clk_ns, functional_sim))
+        self.binaries: dict[int, Path] = {i: sim_binaries[i] for i in range(len(sim_binaries))}
+        match simulation_type:
+            case SimulationType.NODE_BASED_CONNECTED:
+                self.binaries = {
+                    i: self.binaries[i] / "LayerSimulationBackend" for i in self.binaries.keys()
+                }
+            case SimulationType.NODE_BASED_ISOLATED:
+                self.binaries = {
+                    i: self.binaries[i] / "IsolatedSimulationBackend" for i in self.binaries.keys()
+                }
+            case _:
+                raise FINNInternalError(f"Unsupported simulation type: {simulation_type}")
+
+        errors = []
+        for binary in self.binaries.values():
+            if not binary.exists():
+                errors.append(f"Binary {binary} does not exist! Please rerun BuildSimulation!")
+        if len(errors) > 0:
+            raise FINNInternalError("Errors occurred: \n" + "\n\t".join(errors))
+
+    def simulate(self) -> Any:
+        raise NotImplementedError("Call simulate() on subclasses.")
+
+
+class ApplyFIFOSizes(Transformation):
+    """Apply a FIFO sizing configuration to the model.
+    If FIFOs already exist the step is skipped."""
+
+    def __init__(
+        self,
+        cfg: DataflowBuildConfig,
+        fifo_config: Path | None = None,
+        max_qsrl_depth: int = 256,
+        vivado_ram_style: str = "block",
+    ) -> None:
+        """If given read the config json from the given path.
+        Otherwise check in the output directory.
+        """
+        self.cfg = cfg
+        self.max_qsrl_depth = max_qsrl_depth
+        self.vivado_ram_style = vivado_ram_style
+        if fifo_config is None:
+            self.path = Path(cfg.output_dir) / "fifo_config.json"
+        else:
+            self.path = fifo_config
+
+        self.fifo_depths: FIFODepthConfig = []
+        with self.path.open() as f:
+            self.fifo_depths = cast("FIFODepthConfig", json.load(f))
+
+    def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, bool]:
+        """Apply FIFO Simulation Depths to the model."""
+        if len(list(filter(lambda node: "StreamingFIFO" in node.op_type, model.graph.node))) > 0:
+            log.warning(
+                "It seems that StreamingFIFOs have already "
+                "been inserted into the graph. Skipping insertion of FIFOs."
+            )
+            return model, False
+
+        if len(model.graph.node) != len(self.fifo_depths):
+            raise FINNUserError(
+                "There are no StreamingFIFOs in the graph, yet the number "
+                "of nodes and number of FIFO sizes differ. There may be "
+                "unaccounted for nodes that have not been part of the FIFO "
+                "simulation. Consider re-running simulation directly before "
+                "applying the FIFO sizes. It might also be that your model "
+                "or config is outdated, in which case it is recommended to "
+                "re-run the entire flow from start to finish."
+            )
+
+        # FIFO sizes are set as the maximum of outFIFODepth and inFIFODepth of the successor node
+        # Only set the outFIFODepth, because setting both is redundant as inFIFODepth defaults to 0.
+        # Remove all in/outFIFODepths in model for clean slate
+        graph = model.graph
+        for node in graph.node:
+            predecessors = model.find_direct_predecessors(node)
+            successors = model.find_direct_successors(node)
+            n = getCustomOp(node)
+            if n is not None:
+                if predecessors is not None:
+                    n.set_nodeattr("inFIFODepths", [0] * len(predecessors))
+                if successors is not None:
+                    n.set_nodeattr("outFIFODepths", [0] * len(successors))
+
+        # Set new outFIFODepths according to config
+        graph = model.graph
+        node_ind = -1
+        for first_node in graph.node:
+            node_ind += 1
+            n0 = getCustomOp(first_node)
+            if n0 is None:
+                raise FINNInternalError(
+                    f"Node {first_node.name} does not have a custom op instance."
+                    " This is required for FIFO insertion."
+                )
+            if first_node.name != self.fifo_depths[node_ind]["node"]:
+                raise FINNInternalError(
+                    f"Node name {first_node.name} does not match expected name "
+                    f"{self.fifo_depths[node_ind]['node']} at index {node_ind}. "
+                    "This may be due to a mismatch between the model and the config, "
+                    "or due to changes in the model after the simulation was run. "
+                    "Consider re-running the entire flow from start to finish."
+                )
+            fifos = cast("list[int]", (self.fifo_depths[node_ind]["depths"]))
+            n0.set_nodeattr("outFIFODepths", fifos)
+
+        # Insert the FIFOs into the model
+        model = model.transform(InsertFIFO(True, self.max_qsrl_depth, self.vivado_ram_style))
+
+        model = model.transform(GiveUniqueNodeNames())
+        model: ModelWrapper = model.transform(GiveReadableTensorNames())
+        model = model.transform(SpecializeLayers(self.cfg._resolve_fpga_part()))  # noqa
+        model = model.transform(GiveUniqueNodeNames())
+        model: ModelWrapper = model.transform(GiveReadableTensorNames())
+
+        # Sanity check to make sure fifos were inserted
+        inserted_fifo_count = sum(
+            [int("StreamingFIFO" in node.op_type) for node in model.graph.node]
+        )
+        if inserted_fifo_count == 0:
+            raise FINNInternalError(
+                "No FIFOs were inserted. This may be due to "
+                "wrong network configuration, step order or "
+                "a number of other things."
+            )
+        if inserted_fifo_count < int(0.4 * float(len(model.graph.node))):
+            log.warning(
+                "The number of inserted FIFOs makes up less than 40%"
+                " of the total number of nodes in the model. This could "
+                "point to a potential error."
+            )
+
+        return model, False
diff --git a/src/finn/transformation/fpgadataflow/simulation_build.py b/src/finn/transformation/fpgadataflow/simulation_build.py
new file mode 100644
index 0000000000..a21102b8f0
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/simulation_build.py
@@ -0,0 +1,832 @@
+"""Build FINN Simulations."""
+
+import finn_xsi.adapter as finnxsi
+import numpy as np
+import onnx
+import os
+import psutil
+import shlex
+import subprocess
+import sys
+import time
+from ast import literal_eval
+from collections.abc import Callable
+from concurrent.futures import Future, ThreadPoolExecutor
+from enum import Enum
+from onnx import NodeProto, TensorProto, ValueInfoProto
+from pathlib import Path
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.base import Transformation
+from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import get_by_name
+from subprocess import CalledProcessError
+from typing import TYPE_CHECKING, Any, cast
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.util.basic import launch_process_helper, make_build_dir
+from finn.util.exception import FINNInternalError, FINNUserError
+from finn.util.logging import log
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+
+# TODO: Fix that BuildSimulation has to return binaries for either SimulationType
+# TODO: Just store the directory instead - since we build all targets anyways
+
+
+class SimulationType(str, Enum):
+    """Type of simulation."""
+
+    # Individual node simulations connected by IPC
+    NODE_BASED_CONNECTED = "NODE_BASED_CONNECTED"
+
+    # Individual node simulations, isolated. E.g. for analysis purposes
+    NODE_BASED_ISOLATED = "NODE_BASED_ISOLATED"
+
+
+class SimulationBuilder:
+    """Build simulations in FINN."""
+
+    def __init__(self, model: ModelWrapper, fpgapart: str, clk_ns: float) -> None:
+        """Create a new simulation instance."""
+        self.model = model
+        self.fpgapart = fpgapart
+        self.clk_ns = clk_ns
+
+    def _isolated_node_model(self, by_node: int | str | NodeProto) -> ModelWrapper:
+        """Return a modelwrapper that has only the specified node.
+
+        Args:
+            by_node: If int, used as the index of the specified node. If string, assumed to be
+                        the name of the node.
+
+        Returns:
+            ModelWrapper: The isolated-node modelwrapper.
+        """
+        # Find the node
+        index = 0
+        if type(by_node) is int:
+            if by_node < 0 or by_node >= len(self.model.graph.node):
+                raise FINNInternalError(
+                    f"Cannot isolate node index {by_node}. Model has"
+                    f"{len(self.model.graph.node)} nodes."
+                )
+            index = by_node
+        elif type(by_node) is str:
+            node_name = self.model.get_node_from_name(by_node)
+            if node_name is None:
+                raise FINNInternalError(f"Cannot isolate node {by_node}. No such node found.")
+            index = [n.name for n in self.model.graph.node].index(cast("str", node_name))
+        elif type(by_node) is NodeProto:
+            try:
+                index = self.model.graph.node.index(by_node)
+            except Exception as e:
+                raise FINNInternalError(f"Node {by_node.name} not found in the model.") from e
+        else:
+            raise FINNInternalError(
+                f"Cannot find node to isolate: {by_node}. Specify either "
+                f"the index (int), node name (str) or the object itself "
+                f"(NodeProto)."
+            )
+
+        target_op = getCustomOp(self.model.graph.node[index])
+        if not isinstance(target_op, HWCustomOp):
+            raise FINNInternalError(
+                f"Node {target_op.name} is not a HWCustomOp, cannot isolate for simulation."
+            )
+
+        initializers: list[TensorProto] = []
+        value_info_protos: list[ValueInfoProto] = []
+        inputs_graph: list[ValueInfoProto] = []
+        inputs_node: list[ValueInfoProto] = []
+        outputs_graph: list[ValueInfoProto] = []
+        outputs_node: list[ValueInfoProto] = []
+        nodes_graph: list[NodeProto] = []
+
+        preds_list: list | None = self.model.find_direct_predecessors(self.model.graph.node[index])
+        succs_list: list | None = self.model.find_direct_successors(self.model.graph.node[index])
+
+        num_preds = len(preds_list) if preds_list is not None else 0
+        num_succs = len(succs_list) if succs_list is not None else 0
+
+        input_node = False
+        output_node = False
+
+        # Set correct input/output count for input and output nodes, since they have no pred/succ.
+        if num_preds == 0:
+            inputs = self.model.graph.input
+            ret = get_by_name(
+                inputs, self.model.graph.node[index].input[0]
+            )  # Check that node is graph input
+            if ret is not None:
+                num_preds = 1
+                input_node = True
+        if num_succs == 0:
+            outputs = self.model.graph.output
+            ret = get_by_name(
+                outputs, self.model.graph.node[index].output[0]
+            )  # Check that node is graph output
+            if ret is not None:
+                num_succs = 1
+                output_node = True
+
+        num_inputs = len(self.model.graph.node[index].input)
+        num_outputs = len(self.model.graph.node[index].output)
+
+        if num_outputs != num_succs:
+            raise FINNInternalError(
+                f"Node {self.model.graph.node[index].name} has {num_outputs} outputs but "
+                f"{num_succs} successor nodes. This is not supported for isolation."
+            )
+
+        initializer_inputs_list = [
+            self.model.graph.node[index].input[i]
+            for i in range(num_inputs)
+            if self.model.get_initializer(self.model.graph.node[index].input[i]) is not None
+        ]
+
+        # Handle initializers of nodes
+        initializer_inputs = []
+        for init in initializer_inputs_list:
+            ret = self.model.get_initializer(init, return_dtype=True)
+            info = self.model.get_tensor_valueinfo(init)
+            if ret is None or info is None:
+                raise FINNInternalError(
+                    f"Failed to get initializer for {init} "
+                    f"while isolating node {self.model.graph.node[index].name}."
+                )
+            vals, dtype = cast("tuple[np.ndarray, int]", ret)
+            initializers.append(onnx.helper.make_tensor(info.name, dtype, vals.shape, vals))
+            val_info = onnx.helper.make_tensor_value_info(info.name, dtype, vals.shape)
+            value_info_protos.append(val_info)
+            initializer_inputs.append(val_info)
+
+        pred_count = 0
+        for i in range(num_inputs):
+            if self.model.graph.node[index].input[i] in initializer_inputs_list:
+                continue  # This input is handled as an initializer, skip
+            pred_count += 1
+            info = self.model.get_tensor_valueinfo(self.model.graph.node[index].input[i])
+            if info is None:
+                raise FINNInternalError(
+                    f"Failed to get value info for {self.model.graph.node[index].input[i]} "
+                    f"while isolating node {self.model.graph.node[index].name}."
+                )
+            # Setup new input tensors
+            new_input_info = onnx.helper.make_tensor_value_info(
+                info.name,
+                TensorProto.FLOAT,
+                cast("Sequence[int]", target_op.get_normal_input_shape(i)),
+            )
+            new_input_dummy_info = onnx.helper.make_tensor_value_info(
+                info.name + "_dummy",
+                TensorProto.FLOAT,
+                cast("Sequence[int]", target_op.get_normal_input_shape(i)),
+            )
+            # value_info_protos.append(new_input_info)
+            value_info_protos.append(new_input_dummy_info)
+            inputs_graph.append(new_input_info)
+            inputs_node.append(new_input_dummy_info)
+
+            # Create new dummy node to remove data path for input i
+            dummy_node = onnx.helper.make_node(
+                "RemoveDataPath_rtl",
+                inputs=[new_input_info.name],
+                outputs=[new_input_dummy_info.name],
+                domain="finn.custom_op.fpgadataflow.rtl",
+                backend="fpgadataflow",
+                folded_shape=target_op.get_folded_input_shape(i),
+                normal_shape=target_op.get_normal_input_shape(i),
+                dataType=target_op.get_input_datatype(i).name,
+                name=self.model.graph.node[index].name + f"_input_dummy_{i}",
+            )
+
+            nodes_graph.append(dummy_node)
+        inputs_node.extend(initializer_inputs)
+        if pred_count != num_preds:
+            raise FINNInternalError(
+                f"Node {self.model.graph.node[index].name} has {num_preds} pred. nodes but only "
+                f"{pred_count} inputs have been handled."
+            )
+        for i in range(num_succs):
+            info = self.model.get_tensor_valueinfo(self.model.graph.node[index].output[i])
+            if info is None:
+                raise FINNInternalError(
+                    f"Failed to get value info for {self.model.graph.node[index].output[i]} "
+                    f"while isolating node {self.model.graph.node[index].name}."
+                )
+            # Setup new input tensors
+            new_output_info = onnx.helper.make_tensor_value_info(
+                info.name,
+                TensorProto.FLOAT,
+                cast("Sequence[int]", target_op.get_normal_output_shape(i)),
+            )
+            new_output_dummy_info = onnx.helper.make_tensor_value_info(
+                info.name + "_dummy",
+                TensorProto.FLOAT,
+                cast("Sequence[int]", target_op.get_normal_output_shape(i)),
+            )
+            # value_info_protos.append(new_output_info)
+            value_info_protos.append(new_output_dummy_info)
+            outputs_graph.append(new_output_info)
+            outputs_node.append(new_output_dummy_info)
+
+            # Create new dummy node to remove data path for output i
+            dummy_node = onnx.helper.make_node(
+                "RemoveDataPath_rtl",
+                inputs=[new_output_dummy_info.name],
+                outputs=[new_output_info.name],
+                domain="finn.custom_op.fpgadataflow.rtl",
+                backend="fpgadataflow",
+                folded_shape=target_op.get_folded_output_shape(i),
+                normal_shape=target_op.get_normal_output_shape(i),
+                dataType=target_op.get_output_datatype(i).name,
+                name=self.model.graph.node[index].name + f"_output_dummy_{i}",
+            )
+
+            nodes_graph.append(dummy_node)
+
+        target_op_attrs = target_op.get_nodeattr_types()
+        params = {}
+        for attr in target_op_attrs.keys():
+            attr_val = target_op.get_nodeattr(attr)
+            if (
+                (isinstance(attr_val, np.ndarray) and attr_val.size == 0)
+                or attr_val == ""
+                or attr_val == []
+            ):  # Empty value, skip
+                continue
+            params[attr] = target_op.get_nodeattr(attr)
+        new_node = onnx.helper.make_node(
+            self.model.graph.node[index].op_type,
+            inputs=[inp.name for inp in inputs_node],
+            outputs=[outp.name for outp in outputs_node],
+            domain=self.model.graph.node[index].domain,
+            name=self.model.graph.node[index].name,
+            **params,
+        )
+        nodes_graph.append(new_node)
+
+        graph = onnx.helper.make_graph(
+            nodes_graph,
+            f"isolated_node_graph_{self.model.graph.node[index].name}",
+            inputs_graph,
+            outputs_graph,
+            initializer=initializers,
+            value_info=value_info_protos,
+        )
+
+        node_model = onnx.helper.make_model(graph)
+        node_model = ModelWrapper(node_model)
+
+        node_model.set_metadata_prop("predecessors", str([pred.name for pred in inputs_graph]))
+        node_model.set_metadata_prop("successors", str([succ.name for succ in outputs_graph]))
+        node_model.set_metadata_prop("input_node", str(input_node).lower())
+        node_model.set_metadata_prop("output_node", str(output_node).lower())
+
+        # node_model.save(f"isolated_node_model_{self.model.graph.node[index].name}.onnx")
+
+        return node_model
+
+    def _get_stream_descriptions(self, model: ModelWrapper) -> tuple[str, str]:
+        """Return the stream descriptions for the given model for the C++ sim config header.
+
+        Used by for example _build_single_node_simulation().
+
+        Returns:
+            tuple[str, str]: Strings of stream descriptions
+        """
+        # Get IO iterations required
+        instream_iters = []
+        outstream_iters = []
+        for top_inp in model.graph.input:
+            iname = top_inp.name
+            first_node = model.find_consumer(iname)
+            assert first_node is not None, "Failed to find consumer for " + iname
+            top_ind = list(first_node.input).index(iname)
+            ishape_folded = getCustomOp(first_node).get_folded_input_shape(ind=top_ind)
+            instream_iters.append(int(np.prod(ishape_folded[:-1])))
+        for top_out in model.graph.output:
+            oname = top_out.name
+            last_node = model.find_producer(oname)
+            assert last_node is not None, "Failed to find producer for " + oname
+            top_ind = list(last_node.output).index(oname)
+            oshape_folded = getCustomOp(last_node).get_folded_output_shape(ind=top_ind)
+            outstream_iters.append(int(np.prod(oshape_folded[:-1])))
+
+        interface_names = model.get_metadata_prop("vivado_stitch_ifnames")
+        if interface_names is None:
+            raise FINNInternalError(
+                f"{model}: Could not find stitched-IP interface names. "
+                f"Did you run IP Stitching first?"
+            )
+        interface_names = literal_eval(interface_names)
+        if "aximm" in interface_names.keys() and interface_names["aximm"] != []:
+            raise FINNInternalError(
+                f"{model}: CPP XSI Sim does not know how to handle full "
+                f"AXI MM interfaces: {interface_names['aximm']}"
+            )
+        instream_names = [x[0] for x in interface_names["s_axis"]]
+        outstream_names = [x[0] for x in interface_names["m_axis"]]
+
+        # Convert to the format required by the C++ sim config header
+        # (initializer list of pairs of name and iters)
+        def _format_descr_name(s: list[tuple[str, int]]) -> str:
+            return ", ".join([f'StreamDescriptor{{"{name}", {iters}}}' for name, iters in s])
+
+        instream_descrs = [
+            (instream_names[i], instream_iters[i]) for i in range(len(instream_names))
+        ]
+        instream_descrs_str = _format_descr_name(instream_descrs)
+
+        outstream_descrs = [
+            (outstream_names[i], outstream_iters[i]) for i in range(len(outstream_names))
+        ]
+        outstream_descrs_str = _format_descr_name(outstream_descrs)
+        return instream_descrs_str, outstream_descrs_str
+
+    def _create_sim_so(
+        self,
+        model: ModelWrapper,
+        top_module_name: str,
+        vivado_stitched_proj: Path,
+        build_dir: Path | None,
+        debug: bool,
+    ) -> tuple[Path, Path]:
+        """Create a new RTLSim .so file. If one exists already it is used.
+
+        Returns:
+            tuple[Path, Path]: Return sim_base and sim_rel.
+        """
+        rtlsim_so_str = model.get_metadata_prop("rtlsim_so")
+        if (rtlsim_so_str is None) or not Path(rtlsim_so_str).exists():
+            all_verilog_srcs = (
+                (Path(vivado_stitched_proj) / "all_verilog_srcs.txt").read_text().split()
+            )
+            sim_dir = (
+                make_build_dir(f"rtlsim_{model.graph.node[0].name}_")
+                if build_dir is None
+                else build_dir
+            )
+            sim_base, sim_rel = finnxsi.compile_sim_obj(
+                top_module_name, all_verilog_srcs, str(sim_dir), debug=debug
+            )
+            rtlsim_so = Path(sim_base) / Path(sim_rel)
+            model.set_metadata_prop("rtlsim_so", str(rtlsim_so))
+        else:
+            sim_base, sim_rel = cast("str", rtlsim_so_str.split("xsim.dir"))
+            sim_rel = "xsim.dir" + sim_rel
+        return Path(sim_base), Path(sim_rel)
+
+    def _compile_simulation(self, sim_base: Path, silent: bool = True) -> Path:
+        """Compile an existing RTLSIM directory. Requires _create_sim_so to be run before. Expects
+        rtlsim_config.hpp to be templated already.
+
+        Returns:
+            Path: Path to the executable shell script to run the binary
+        """
+        # Determine executable name
+        compile_targets = ["LayerSimulationBackend", "IsolatedSimulationBackend"]
+        if all((Path(sim_base) / execname).exists() for execname in compile_targets):
+            # Simulation was already compiled, we can return early
+            return Path(sim_base)
+
+        # Check where FINNXSI is
+        finnxsi_dir = os.environ["FINN_XSI"]
+
+        # Running CMake first
+        cmake_call = f"{sys.executable} -m cmake -S {finnxsi_dir} -B {sim_base}"
+        log.debug(f"Running cmake on RTLSIM Wrapper in {sim_base}")
+        try:
+            launch_process_helper(
+                shlex.split(cmake_call),
+                cwd=finnxsi_dir,
+                print_stdout=silent,
+                print_stderr=silent,
+                proc_env=os.environ.copy(),
+            )
+        except CalledProcessError as e:
+            raise FINNInternalError(f"Failed to run cmake in {sim_base}") from e
+
+        # Calling make to actually build the simulation
+        makefile = Path(sim_base) / "Makefile"
+        if not makefile.exists():
+            raise FINNInternalError(f"Failed to create Makefile in {sim_base}!")
+        try:
+            launch_process_helper(
+                ["make"],
+                proc_env=os.environ.copy(),
+                cwd=sim_base,
+                print_stdout=silent,
+                print_stderr=silent,
+            )
+        except CalledProcessError as e:
+            raise FINNInternalError(f"Failed to create executable in {sim_base}!") from e
+
+        errors = []
+        for target in compile_targets:
+            simulation_executable = Path(sim_base) / target
+            if not simulation_executable.exists():
+                errors.append(
+                    f"Simulation compile target {target} was not created. "
+                    f"Check {sim_base} to run make manually."
+                )
+        if len(errors) > 0:
+            raise FINNInternalError("Error compiling simulations: \n" + "\n\t".join(errors))
+        return sim_base
+
+    def _template_rtlsim_config(
+        self,
+        model: ModelWrapper,
+        sim_base: Path,
+        input_interface_names: list[str] | None,
+        output_interface_names: list[str] | None,
+        node_index: int,
+        total_nodes: int,
+        timeout_cycles: int,
+        top_module_name: str,
+        trace_file: str | None,
+    ) -> Path:
+        """Template finn_xsi/finn_xsi/rtlsim_config.hpp.template with the correct values and
+        return the templated file.
+        """
+        finnxsi_dir = os.environ["FINN_XSI"]
+        # Prepare the C++ driver config template
+        (
+            instream_descrs_str,
+            outstream_descrs_str,
+        ) = self._get_stream_descriptions(model)
+        template_dict = {
+            "TIMEOUT_CYCLES": timeout_cycles,
+            # name of the top-level HDL module
+            "TOP_MODULE_NAME": top_module_name,
+            # top-level AXI stream descriptors
+            "ISTREAM_DESC": instream_descrs_str,
+            "OSTREAM_DESC": outstream_descrs_str,
+            # control tracing and trace filename
+            "TRACE_FILE": "std::nullopt" if trace_file is None else f'"{trace_file}"',
+            # sim kernel .so to use (depends on Vivado version)
+            "SIMKERNEL_SO": finnxsi.get_simkernel_so(),
+            # log file for xsi (not the sim driver)
+            "XSIM_LOG_FILE": '"xsi.log"',
+            "INPUT_INTERFACE_NAMES": ",".join(['"' + name + '"' for name in input_interface_names])
+            if input_interface_names is not None
+            else "",
+            "OUTPUT_INTERFACE_NAMES": ",".join(
+                ['"' + name + '"' for name in output_interface_names]
+            )
+            if output_interface_names is not None
+            else "",
+            "INPUT_INTERFACE_COUNT": len(input_interface_names)
+            if input_interface_names is not None
+            else 0,
+            "OUTPUT_INTERFACE_COUNT": len(output_interface_names)
+            if output_interface_names is not None
+            else 0,
+            "NODE_INDEX": node_index,
+            "TOTAL_NODES": total_nodes,
+            "IS_INPUT_NODE": model.get_metadata_prop("input_node"),
+            "IS_OUTPUT_NODE": model.get_metadata_prop("output_node"),
+        }
+
+        fifosim_config_fname = Path(finnxsi_dir) / "rtlsim_config.hpp.template"
+        fsim_config = fifosim_config_fname.read_text()
+        for key, val in template_dict.items():
+            fsim_config = fsim_config.replace(f"@{key}@", str(val))
+        # Write the config to the simulation directory
+        rtlsim_config = Path(sim_base) / "rtlsim_config.hpp"
+        rtlsim_config.write_text(fsim_config)
+        return rtlsim_config
+
+    def build_single_node_simulation(
+        self,
+        node_model: ModelWrapper,
+        node_index: int,
+        total_nodes: int,
+        input_interface_names: list[str] | None,
+        output_interface_names: list[str] | None,
+        build_dir: Path | None,
+        timeout_cycles: int = 0,
+        silent: bool = False,
+    ) -> Path:
+        """Build the simulation binary for a single node.
+
+        This can be used both by the connected node-by-node sim and the isolated node sim.
+
+        Much of this is from the rtlsim_exec.py in core/
+
+        Args:
+            node_model: The single node ModelWrapper to build the simulation from.
+            node_index: The index of the simulated node. Used to determine whether a node shares IO
+                        with successors or predecessors.
+            total_nodes: The total number of nodes in the complete design.
+            input_interface_names: Names of input interfaces for IPC communication. Required by the
+                                connected simulation to access the correct shared memory segment
+                                between this node and its predecessors.
+            output_interface_names: Names of output interfaces for IPC communication. Required by
+                                the connected simulation to access the correct shared memory segment
+                                between this node and its successors.
+            build_dir: If given, use this directory for building the simulation. Otherwise one is
+                        created from the nodes name.
+            timeout_cycles: Number of cycles until simulation timeout. When set to 0 (default), no
+                            timeout is given.
+            silent: If True, silences the Cmake and make output (including stderr)
+
+        Returns:
+            Path: The path to the simulation binary (shell script).
+        """
+        # TODO: Check if something is an output node instead of checking the node index
+        # TODO: Requires changes in the C++ code as well
+
+        # Check that the relevant data exists
+        wrapper_filename = node_model.get_metadata_prop("wrapper_filename")
+        if wrapper_filename is None or not Path(wrapper_filename).exists():
+            raise FINNUserError(
+                f"Call CreateStitchedIP prior to building "
+                f"the simulation for {self.model.graph.node[node_index].name}. "
+                f"wrapper_filename is set to {wrapper_filename}!"
+            )
+
+        vivado_stitched_proj = node_model.get_metadata_prop("vivado_stitch_proj")
+        if vivado_stitched_proj is None or not Path(vivado_stitched_proj).exists():
+            raise FINNUserError(
+                f"Call CreateStitchedIP prior to building "
+                f"the simulation for {self.model.graph.node[node_index].name}."
+                "(vivado_stitch_proj not set!)"
+            )
+
+        trace_file = cast("str | None", node_model.get_metadata_prop("rtlsim_trace"))
+        debug = not (trace_file is None or trace_file == "")
+
+        # Get the module name and path
+        top_module_file = Path(wrapper_filename).resolve().absolute()
+        top_module_name = top_module_file.name.strip(".v")
+
+        # Build the simulation .so and save it in the "rtlsim_so" metadata prop
+        sim_base, _ = self._create_sim_so(
+            node_model, top_module_name, Path(vivado_stitched_proj), build_dir, debug
+        )
+
+        # Fill out the simulation config header
+        _ = self._template_rtlsim_config(
+            node_model,
+            sim_base,
+            input_interface_names,
+            output_interface_names,
+            node_index,
+            total_nodes,
+            timeout_cycles,
+            top_module_name,
+            trace_file,
+        )
+
+        # Building the whole simulation
+        return self._compile_simulation(sim_base, silent=silent).absolute()
+
+    def _build_simulations_parallel(
+        self, with_live_display: bool, functional_sim: bool
+    ) -> dict[int, Path]:
+        """Build all nodes in the model in parallel, as isolated simulations, ready for usage in
+        an IPC connected simulation chain.
+
+        Args:
+            workers: Number of parallel workers to use.
+            with_live_display: If True, display the building progress in a rich progress bar.
+            functional_sim: Use a functional simulation (faster but takes time to build)
+            sim_type: Type of simulation
+
+        Returns:
+            Dict of executables that start the simulation of the given nodes,
+            indexed by the node-index. These are in their respective FINN_TMP
+            directories.
+        """
+        log.info(f"Building simulation binaries for {len(self.model.graph.node)} layers.")
+
+        def _build(
+            node_index: int,
+            total_nodes: int,
+            build_dir: Path,
+        ) -> Any:
+            nodemodel = self._isolated_node_model(node_index)
+            nodemodel = nodemodel.transform(InferShapes())
+            nodemodel = nodemodel.transform(PrepareIP(self.fpgapart, self.clk_ns))
+            nodemodel = nodemodel.transform(
+                CreateStitchedIP(self.fpgapart, self.clk_ns, functional_simulation=functional_sim)
+            )
+            input_interface_names = nodemodel.get_metadata_prop("predecessors")
+            if input_interface_names is not None:
+                input_interface_names = literal_eval(input_interface_names)
+            output_interface_names = nodemodel.get_metadata_prop("successors")
+            if output_interface_names is not None:
+                output_interface_names = literal_eval(output_interface_names)
+            return self.build_single_node_simulation(
+                nodemodel,
+                node_index,
+                total_nodes,
+                input_interface_names,
+                output_interface_names,
+                build_dir,
+                silent=with_live_display,
+            )
+
+        total_nodes = len(self.model.graph.node)
+        log.info(f"[BuildSimulation] Preparing to build {total_nodes} nodes for the simulation.")
+        futures: dict[int, Future] = {}
+        built_nodes = 0
+
+        # Progress display callback
+        def _callback_progress(name: str) -> Callable:
+            nonlocal total_nodes, built_nodes
+
+            def _f(f: Future) -> None:
+                nonlocal total_nodes, built_nodes
+                built_nodes += 1
+                log.info(
+                    f"[ [bold green]"
+                    f"{int(100.0 * float(built_nodes) / float(total_nodes))}%[/bold green]"
+                    f" ] {name}",
+                    extra={"markup": True, "highlighter": None},
+                )
+                # Unpack result once so that the pool fails immediately, instead of waiting for
+                # all futures to be completed.
+                f.result()
+
+            return _f
+
+        # Build sims in parallel
+        synth_workers = max(
+            1, cast("int", (psutil.virtual_memory().free / 1024 / 1024 / 1024) // 10)
+        )  # 10GB per synthesis
+        if not functional_sim:
+            # When not having to do synthesis, the build is not memory bottlenecked and
+            # can be executed as parallel as possible
+            synth_workers = int(os.environ.get("NUM_DEFAULT_WORKERS", len(self.model.graph.node)))
+
+        # Build (stitched IP, cmake, make) all sims in parallel and return paths to
+        # the compiled executables
+        log.info("[BuildSimulation] Starting the build process.")
+        with ThreadPoolExecutor(max_workers=synth_workers) as pool:
+            for i in range(total_nodes):
+                node_name = self.model.graph.node[i].name
+                futures[i] = pool.submit(
+                    _build,
+                    i,
+                    total_nodes - 1,
+                    Path(make_build_dir(f"rtlsim_{node_name}_")),
+                )
+                futures[i].add_done_callback(_callback_progress(node_name))
+            pool.shutdown(wait=True)
+
+        # Check if all binaries were compiled successfully
+        binaries = {i: future.result() for i, future in futures.items()}
+        not_found_binaries = []
+        for i, binary in binaries.items():
+            if binary is None:
+                not_found_binaries.append(i)
+        if len(not_found_binaries) > 0:
+            raise FINNInternalError(
+                "Building simulations failed. "
+                "Failed simulation binaries: " + ", ".join(not_found_binaries)
+            )
+        return binaries
+
+    def build_simulation(self, with_live_display: bool, functional_sim: bool) -> dict[int, Path]:
+        """Build a simulation of the given type, return the path to the executable directory
+        (indexed by the corresponding node index in the graph).
+
+        Args:
+            simtype: Simulation type to build.
+            workers: Number of workers to use in parallel.
+                Normally set by the Simulation() class automatically.
+            with_live_display: If True, display a live progress-bar.
+            functional_sim: If True, use functional simulation (faster but takes some time to build)
+        """
+        return self._build_simulations_parallel(with_live_display, functional_sim)
+
+
+class BuildSimulation(Transformation):
+    """Build a simulation of the given type for the model.
+    Puts the model into a prepared state (changes the graph).
+    If simulation binaries already exist, enter their directory and only re-compile."""
+
+    def __init__(
+        self,
+        fpgapart: str,
+        clk_ns: float,
+        functional_sim: bool,
+    ) -> None:
+        """Create a new BuildSimulation transform."""
+        self.functional_sim = functional_sim
+        self.fpgapart = fpgapart
+        self.clk_ns = clk_ns
+
+    def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, bool]:
+        """Build / compile the model. Modifies the model."""
+        self.model = model
+
+        # Check if we already have stitched IPs and built simulations. If so, rerun only cmake/make
+        needs_rebuild = True
+        sim_binaries = self.model.get_metadata_prop("simulation_binaries")
+
+        # 1. Check if binary paths are saved in the model
+        if sim_binaries is not None:
+            sim_binaries = sim_binaries.split("\n")
+
+            # 2. Check that the model size hasn't changed since creating the binaries. Otherwise
+            # we should rebuild.
+            if len(sim_binaries) != len(self.model.graph.node):
+                log.info(
+                    f"[BuildSimulation] Found existing binaries, but number ({len(sim_binaries)}) "
+                    f"does not match number of nodes in the graph "
+                    f"({len(self.model.graph.node)}). Rebuilding..."
+                )
+            else:
+                log.info("Existing simulations found. Re-running only CMake/Make..")
+                needs_rebuild = False
+        else:
+            log.info("[BuildSimulation] No simulation binaries found, building now.")
+
+        # If needed, call the Builder to create the layer simulation binaries.
+        # This creates both the isolated and connected binaries in one go.
+        if needs_rebuild:
+            log.info("[BuildSimulation] Starting model preparation.")
+            self._prepare_model()
+            self.builder = SimulationBuilder(self.model, self.fpgapart, self.clk_ns)
+            sys.stdout = sys.stdout.console  # type: ignore
+            self.binaries = self.builder.build_simulation(
+                with_live_display=False,
+                functional_sim=self.functional_sim,
+            )
+            self.model.set_metadata_prop(
+                "simulation_binaries", "\n".join([str(p) for p in self.binaries.values()])
+            )
+        else:
+            # Run only compilation again, and avoid repeating building of the stitched IPs
+            def _compile(binary: Path) -> None:
+                result = subprocess.run(
+                    "cmake .;make",
+                    shell=True,
+                    cwd=str(binary),
+                    text=True,
+                    capture_output=True,
+                )
+                if result.returncode != 0:
+                    raise FINNUserError(f"Failed compilation in {binary}: {result.stderr}")
+
+            # Since we dont need a rebuild, sim_binaries contains the paths to the binaries
+            sim_binaries = [Path(p) for p in sim_binaries]
+            total = len(sim_binaries)
+
+            # Prepare compiling the binaries again
+            done = 0
+
+            def _progress_callback(binary: str | Path) -> Callable:
+                nonlocal done, total
+
+                def _f(future: Future) -> None:
+                    nonlocal done, total
+                    done += 1
+                    log.info(
+                        f"[ [bold green]{int(100.0 * float(done) / float(total))}%[/bold green] ] "
+                        f"Simulation [green italic]{binary}[/green italic] built.",
+                        extra={"markup": True, "highlighter": None},
+                    )
+                    future.result()
+
+                return _f
+
+            # Run the compilation in parallel with the number of workers specified.
+            # If not specified, use 8
+            compile_start = time.time()
+            futures: list[Future] = []
+            with ThreadPoolExecutor(int(os.environ.get("NUM_DEFAULT_WORKERS", "8"))) as tpe:
+                for binary in sim_binaries:
+                    futures.append(tpe.submit(_compile, binary))
+                    futures[-1].add_done_callback(_progress_callback(binary.name))
+            tpe.shutdown()
+            compile_end = time.time()
+            log.info(f"Compilation done. Took {compile_end - compile_start} seconds")
+        return self.model, False
+
+    def _prepare_model(self) -> None:
+        """Execute some preparation transformations on the model."""
+        log.info("[BuildSimulation] Inserting DataWidthConverters...")
+        self.model = self.model.transform(InsertDWC())
+        log.info("[BuildSimulation] Specializing layers...")
+        self.model = self.model.transform(SpecializeLayers(self.fpgapart))
+        log.info("[BuildSimulation] Assigning unique and readable node and tensor names...")
+        self.model = self.model.transform(GiveUniqueNodeNames())
+        self.model = self.model.transform(GiveReadableTensorNames())
+        log.info("[BuildSimulation] Preparing IPs...")
+        self.model = self.model.transform(PrepareIP(self.fpgapart, self.clk_ns))
+        log.info("[BuildSimulation] Synthesizing IPs...")
+        self.model = self.model.transform(HLSSynthIP())
+        log.info("[BuildSimulation] Model preparation done.")
diff --git a/src/finn/transformation/fpgadataflow/simulation_connected.py b/src/finn/transformation/fpgadataflow/simulation_connected.py
new file mode 100644
index 0000000000..7aca8ac069
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/simulation_connected.py
@@ -0,0 +1,1628 @@
+"""Node connected parallel simulations."""
+
+import glob
+import json
+import math
+import os
+import pandas as pd
+import time
+import traceback
+from concurrent.futures import Future, ThreadPoolExecutor
+from copy import deepcopy
+from enum import Enum
+from pathlib import Path
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.base import Transformation
+from rich.console import Console
+from threading import Barrier
+from typing import Any, cast
+
+from finn.builder.build_dataflow_config import DataflowBuildConfig
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.transformation.fpgadataflow.set_fifo_depths import get_fifo_split_configs
+from finn.transformation.fpgadataflow.simulation import Simulation, SimulationType, store_fifo_data
+from finn.transformation.fpgadataflow.simulation_controller import SimulationController
+from finn.util.basic import make_build_dir
+from finn.util.exception import FINNInternalError, FINNUserError
+from finn.util.logging import log
+
+
+# Hardware BRAM FIFOs lose entries to internal pipeline registers compared to the software FIFO
+# model (which has exact capacity). This constant accounts for that overhead so that the
+# minimization algorithm finds depths that are safe to deploy on hardware.
+BRAM_FIFO_PIPELINE_OVERHEAD = 2
+
+
+def _count_bram_sub_fifos(depth: int, max_qsrl_depth: int) -> int:
+    """Return the number of BRAM (vivado) sub-FIFOs that *depth* decomposes into.
+
+    Non-power-of-two BRAM FIFOs are decomposed into several power-of-two sub-FIFOs by
+    get_fifo_split_configs.  Each sub-FIFO whose style is "vivado" has its own pipeline
+    register overhead, so the total overhead scales with the sub-FIFO count.
+    """
+    return sum(1 for _, style in get_fifo_split_configs(depth, max_qsrl_depth) if style == "vivado")
+
+
+def _safe_bram_starting_depth(peak_util: int, max_qsrl_depth: int) -> int:
+    """Return the smallest depth d such that d minus its BRAM pipeline overhead >= peak_util + 1.
+
+    For LUTRAM depths (d <= max_qsrl_depth) the software model is exact so no overhead is needed.
+    For BRAM depths the overhead depends on how many sub-FIFOs the decomposition produces,
+    which itself depends on d.  We iterate (typically 1-2 steps) until the overhead stabilises.
+    """
+    d = max(peak_util + 1, 32)
+    if d <= max_qsrl_depth:
+        return d
+    # Iteratively find d where d - num_vivado(d)*overhead >= peak_util + 1
+    overhead = 0
+    while True:
+        d = peak_util + 1 + overhead
+        num_vivado = _count_bram_sub_fifos(d, max_qsrl_depth)
+        new_overhead = num_vivado * BRAM_FIFO_PIPELINE_OVERHEAD
+        if new_overhead <= overhead:
+            break
+        overhead = new_overhead
+    return max(d, 32)
+
+
+class MinimizationOrder(Enum):
+    """The order in which the search algorithm minimizes the FIFO depths."""
+
+    NODE_ORDER = 0
+    REVERSE_NODE_ORDER = 1
+    LARGEST_BITWIDTH_DIFF_FIRST = 2
+    SMALLEST_BITWIDTH_DIFF_FIRST = 3
+
+    # Non black-box model orders
+    AFTER_THRESHOLDS_FIRST = 4
+    AFTER_DWC_FIRST = 5
+
+    # Half black-box
+    # If we ran a sim before, we know the largest FIFOs, so start with these.
+    # This strategy might work, if the changes to the model are small enough
+    REUSE_PREVIOUS_ORDER = 6
+
+
+class NodeConnectedSimulationController(SimulationController):
+    """Run simulations for node connected cases."""
+
+    def __init__(
+        self,
+        parallel_simulations: int,
+        names: list[str],
+        binaries: list[Path],
+        console: Console,
+        poll_interval: float = 1.0,
+        with_progressbar: bool = True,
+    ) -> None:
+        """Set up node connected simulation."""
+        super().__init__(
+            parallel_simulations, names, binaries, console, poll_interval, with_progressbar
+        )
+        # Synchronization barrier for configuration phase
+        self.sync_barrier: Barrier | None = None
+        for binary in binaries:
+            if not binary.exists():
+                console.log(f"Binary {binary} does not exist!")
+                raise FINNUserError(f"Binary {binary} does not exist!")
+
+    def _cleanup_shm_resources(self) -> None:
+        """Remove any existing shared memory segments and semaphores from /dev/shm."""
+        try:
+            # Collect potential shared memory and semaphore names based on node names
+            shm_patterns = []
+            # Pattern for shared memory segments (e.g., /nodename_0, /nodename_1)
+            shm_patterns.append("/dev/shm/*")
+
+            removed_count = 0
+            for pattern in shm_patterns:
+                for filepath in glob.glob(pattern):
+                    try:
+                        Path(filepath).unlink()
+                        removed_count += 1
+                    except (FileNotFoundError, PermissionError):  # noqa: PERF203
+                        # File might already be removed or we don't have permission
+                        pass
+
+            if removed_count > 0:
+                log.info(f"Cleaned up {removed_count} existing shared memory resources")
+        except Exception as e:
+            # Don't fail if cleanup fails - just log it
+            self.console.log(f"Warning: Error during shared memory cleanup: {e}")
+
+    def run(
+        self,
+        depth: list[list[int]] | None = None,
+        output_json: Path | None = None,
+        max_cycles: int | None = None,
+        fifo_first_valid_cycles: list[list[int]] | None = None,
+    ) -> dict[str, list[int]]:
+        """Run the simulation entirely with the given depth and sample count.
+
+        Args:
+            depth: FIFO depth to configure for simulations.
+            samples: Number of samples to simulate.
+            output_json: Optional path to write merged simulation data as JSON.
+            max_cycles: Max cycles
+            fifo_first_valid_cycles: First valid cycle for each FIFO (used for timeout detection)
+
+        Returns:
+            Dictionary mapping simulation names to their FIFO utilization arrays.
+        """
+        futures: list[Future] = []
+        fifo_results: dict[str, list[int]] = {}
+        cycles_results: dict[str, int] = {}
+        samples_results: dict[str, int] = {}
+        intervals_results: dict[str, list[int]] = {}
+        timeout_result = False
+        fifo_depths: dict[str, list[int]] = {}
+        fifo_cycles_until_first_valid_results: dict[str, list[int]] = {}
+
+        # Clean up any existing shared memory resources before starting
+        self._cleanup_shm_resources()
+
+        # Initialize barrier for all simulations to synchronize after configuration
+        self.sync_barrier = Barrier(len(self.names))
+
+        if self.progress is not None:
+            self.progress.start()
+        try:
+            with ThreadPoolExecutor(self.workers) as pool:
+                for i, (name, binary) in enumerate(zip(self.names, self.binaries, strict=True)):
+                    is_last_node = i == len(self.names) - 1
+                    is_special_for_display = i == 0 or is_last_node
+                    futures.append(
+                        pool.submit(
+                            self._run_binary,
+                            binary,
+                            name,
+                            i % len(os.sched_getaffinity(0))
+                            if len(os.sched_getaffinity(0)) < len(self.names)
+                            else -1,  # sched_getaffinity needed, because
+                            # cpu_count does not handle well with workload schedulers.
+                            # We only pin the core if we have more simulations than cores to avoid
+                            # simulations moving around too much and hurting performance. If we have
+                            # more cores than simulations, we leave it to the OS to schedule.
+                            depth[i] if depth is not None else None,
+                            is_last_node,  # Only last node has no output FIFOs
+                            is_special_for_display,  # First and last get special coloring
+                            max_cycles,
+                            fifo_first_valid_cycles[i]
+                            if fifo_first_valid_cycles is not None
+                            else None,
+                        )
+                    )
+
+                # Wait for first completion or error
+                from concurrent.futures import FIRST_COMPLETED, wait
+
+                all_futures = list(futures)  # Keep track of all futures
+                while futures:
+                    done, futures = wait(futures, return_when=FIRST_COMPLETED)
+
+                    # Check if any completed task indicates we should stop
+                    for future in done:
+                        try:
+                            result = future.result()  # This will raise if there was an exception
+                            if result is not None:
+                                (
+                                    sim_name,
+                                    fifo_util,
+                                    cycles,
+                                    samps,
+                                    intervals,
+                                    timeout,
+                                    fifo_depth,
+                                    fifo_cycles_until_first_valid,
+                                ) = result
+                                fifo_depths[sim_name] = fifo_depth
+                                fifo_results[sim_name] = fifo_util
+                                cycles_results[sim_name] = cycles
+                                samples_results[sim_name] = samps
+                                intervals_results[sim_name] = intervals
+                                fifo_cycles_until_first_valid_results[sim_name] = (
+                                    fifo_cycles_until_first_valid
+                                )
+                                timeout_result = timeout_result or timeout
+                        except Exception as e:  # noqa
+                            self.console.log(f"Simulation failed: {e}")
+                            # Set stop flag and break
+                            with self.stop_lock:
+                                self.should_stop = True
+                            break
+
+                    # If we should stop, signal all remaining simulations
+                    with self.stop_lock:
+                        if self.should_stop:
+                            # Don't cancel - let them finish with early stop
+                            break
+
+                # Wait for all futures to complete and collect their results
+                pool.shutdown(wait=True)
+                for future in all_futures:
+                    if not future.done():
+                        continue
+                    try:
+                        result = future.result()
+                        if result is not None:
+                            (
+                                sim_name,
+                                fifo_util,
+                                cycles,
+                                samps,
+                                intervals,
+                                timeout,
+                                fifo_depth,
+                                fifo_cycles_until_first_valid,
+                            ) = result
+                            # Only update if not already collected
+                            if sim_name not in fifo_results:
+                                fifo_cycles_until_first_valid_results[sim_name] = (
+                                    fifo_cycles_until_first_valid
+                                )
+                                fifo_depths[sim_name] = fifo_depth
+                                fifo_results[sim_name] = fifo_util
+                                cycles_results[sim_name] = cycles
+                                samples_results[sim_name] = samps
+                                intervals_results[sim_name] = intervals
+                                timeout_result = timeout_result or timeout
+                    except Exception as e:
+                        self.console.log(f"Error collecting result: {e}")
+
+                # Detect nodes whose _run_binary returned None (subprocess
+                # crash / unhandled exception).  Their names were never inserted into
+                # fifo_results, so the merged JSON would contain empty 'intervals' lists
+                # for those nodes.  _check_performance would then silently return False
+                # (no degradation detected) and the minimisation algorithm would treat a
+                # failed simulation as a successful one.  Mark the run as timed-out so
+                # that _test_depth correctly rejects the candidate depth.
+                missing_nodes = [name for name in self.names if name not in fifo_results]
+                if missing_nodes:
+                    self.console.log(
+                        f"[bold red]WARNING: simulation results missing for node(s) "
+                        f"{missing_nodes} (subprocess likely crashed). "
+                        f"Marking run as timed-out to prevent false-success "
+                        f"classification.[/bold red]"
+                    )
+                    timeout_result = True
+        finally:
+            if self.progress is not None:
+                self.progress.stop()
+            self._cleanup_sockets()
+
+        # Merge all simulation data
+        if output_json is not None:
+            merged_data = {
+                "simulations": [
+                    {
+                        "name": name,
+                        "fifo_utilization": fifo_results.get(name, []),
+                        "fifo_depth": fifo_depths.get(name, []),
+                        "cycles": cycles_results.get(name, 0),
+                        "samples": samples_results.get(name, 0),
+                        "intervals": intervals_results.get(name, []),
+                        "fifo_cycles_until_first_valid": fifo_cycles_until_first_valid_results.get(
+                            name, []
+                        ),
+                    }
+                    for name in self.names
+                ],
+                "depth_configured": depth,
+                "timeout_occurred": timeout_result,
+            }
+            output_json.write_text(json.dumps(merged_data, indent=2))
+
+        return fifo_results
+
+    def _run_binary(
+        self,
+        binary: Path,
+        name: str | None,
+        _cpu: int | None,
+        depth: list[int] | None = None,
+        is_last_node: bool = False,
+        is_special_for_display: bool = False,
+        max_cycles: int | None = None,
+        fifo_first_valid_cycles: list[int] | None = None,
+    ) -> tuple[str, list[int], int, int, list[int], bool, list[int], list[int]] | None:
+        """Run the specified simulation binary in a new subprocess and communicate with it.
+
+        Args:
+            binary: Path to simulation binary
+            name: Name of simulation node
+            _cpu: CPU affinity (unused)
+            depth: List of FIFO depths for this node's output FIFOs
+            is_last_node: True if this is the last node (no output FIFOs to configure)
+            is_special_for_display: True if this node should get special color in logs
+            max_cycles: Maximum cycles to simulate
+            fifo_first_valid_cycles: First valid cycle for each FIFO (used for timeout detection)
+
+        Returns:
+            Tuple of (simulation_name, fifo_utilization, cycles, samples, intervals, timeout,
+            fifo_depth, fifo_cycles_until_first_valid) on success,
+            None on failure.
+        """
+        cwd = binary.parent
+        if name is None:
+            name = cwd.name.replace("rtlsim_", "")
+
+        process_index = self.names.index(name)
+
+        with (self.logdir / f"{name}_{process_index}_of_{self.total}.txt").open("w+") as logfile:
+
+            def _print(msg: str, color: str = "green") -> None:
+                if self.progress is None:
+                    if is_special_for_display:
+                        color = "orange3"
+                    if "ERROR" in msg:
+                        color = "red"
+                    log.debug(
+                        f"[bold {color}]{name:<35}"
+                        f"[/bold {color}][cornflower_blue]{process_index} "
+                        f"/ {len(self.names) - 1}[/cornflower_blue] {msg:<35}"
+                    )
+                logfile.write(f"{msg}\n")
+                logfile.flush()
+
+            try:
+                # Start the simulation process with socket communication
+                proc_idx = self._start_process(
+                    binary, process_index, cpu=_cpu if _cpu is not None else -1
+                )
+
+                # Send configuration commands
+                # Last node has no output FIFOs, so don't configure FIFO depths
+                config_payload: dict[str, list[int] | int] = {}
+                if not is_last_node and depth is not None:
+                    config_payload["fifo_depth"] = depth
+                if max_cycles is not None:
+                    config_payload["max_cycles"] = max_cycles
+                if not is_last_node and fifo_first_valid_cycles is not None:
+                    config_payload["fifo_first_valid_cycles"] = fifo_first_valid_cycles
+
+                response = self._send_and_receive(proc_idx, "configure", config_payload)
+
+                if not response or response.get("status") != "success":
+                    error_msg = (
+                        response.get("message", "Unknown error") if response else "No response"
+                    )
+                    _print(f"Configuration failed: {error_msg}", "red")
+                    return None
+
+                # Wait for all simulations to complete configuration before starting
+                _print("Waiting for all simulations to complete configuration...")
+                if self.sync_barrier is not None:
+                    self.sync_barrier.wait()
+                _print("All simulations configured, starting...")
+
+                # Start the simulation
+                response = self._send_and_receive(proc_idx, "start", {})
+
+                if not response or response.get("status") != "success":
+                    error_msg = (
+                        response.get("message", "Unknown error") if response else "No response"
+                    )
+                    _print(f"Failed to start simulation: {error_msg}", "red")
+                    return None
+
+                cycles = 0
+                samps = 0
+                intervals: list[int] = []
+                timeout = False
+                fifo_util: list[int] = []
+                fifo_depth: list[int] = []
+                fifo_cycles_until_first_valid: list[int] = []
+
+                # Poll for status updates
+                while True:
+                    # Check if we should stop early
+                    with self.stop_lock:
+                        if self.should_stop:
+                            try:
+                                stop_response = self._send_and_receive(proc_idx, "stop", {})
+                            except (BrokenPipeError, ConnectionResetError, RuntimeError):
+                                # Process may have already exited - that's ok during shutdown
+                                stop_response = None
+                            if stop_response:
+                                cycles = stop_response.get("cycles", 0)
+                                samps = stop_response.get("samples", 0)
+                                fifo_util = stop_response.get("fifo_utilization", [])
+                                intervals = stop_response.get("intervals", [])
+                                fifo_depth = stop_response.get("fifo_depth", [])
+                                timeout = stop_response.get("timeout", False)
+                                fifo_cycles_until_first_valid = stop_response.get(
+                                    "fifo_cycles_until_first_valid", []
+                                )
+                                if fifo_util:
+                                    logfile.write(f"Final FIFO utilization: {fifo_util}\n")
+                            return (
+                                name,
+                                fifo_util,
+                                cycles,
+                                samps,
+                                intervals,
+                                timeout,
+                                fifo_depth,
+                                fifo_cycles_until_first_valid,
+                            )
+                    time.sleep(self.poll_interval)
+
+                    response = self._send_and_receive(proc_idx, "status", {})
+
+                    if not response:
+                        _print("Lost connection to simulation", "red")
+                        with self.stop_lock:
+                            self.should_stop = True
+                        raise RuntimeError("Lost connection to simulation")
+
+                    state = response.get("state", "unknown")
+
+                    if state == "finished" or state == "timeout":
+                        cycles = response.get("cycles", 0)
+                        samps = response.get("samples", 0)
+                        fifo_util = response.get("fifo_utilization", [])
+                        fifo_depth = response.get("fifo_depth", [])
+                        intervals = response.get("intervals", [])
+                        timeout = response.get("timeout", False)
+                        fifo_cycles_until_first_valid = response.get(
+                            "fifo_cycles_until_first_valid", []
+                        )
+                        with self.stop_lock:
+                            self.should_stop = True
+                        break
+
+                    if state == "running":
+                        # Update progress if available
+                        cycles = response.get("cycles", 0)
+
+                    if state == "error":
+                        error_msg = response.get("message", "Unknown error")
+                        _print(f"Simulation error: {error_msg}", "red")
+                        # Signal other simulations to stop
+                        with self.stop_lock:
+                            self.should_stop = True
+                        raise RuntimeError(f"Simulation error: {error_msg}")
+
+                # Stop the simulation
+                stop_response = self._send_and_receive(proc_idx, "stop", {})
+
+                if stop_response:
+                    fifo_util = stop_response.get("fifo_utilization", [])
+                    fifo_depth = stop_response.get("fifo_depth", [])
+                    cycles = stop_response.get("cycles", 0)
+                    samps = stop_response.get("samples", 0)
+                    fifo_cycles_until_first_valid = stop_response.get(
+                        "fifo_cycles_until_first_valid", []
+                    )
+                    if fifo_util:
+                        logfile.write(f"Final FIFO utilization: {fifo_util}\n")
+
+                return (
+                    name,
+                    fifo_util,
+                    cycles,
+                    samps,
+                    intervals,
+                    timeout,
+                    fifo_depth,
+                    fifo_cycles_until_first_valid,
+                )
+
+            except Exception as e:
+                self.console.log(f"Exception caught during simulation execution ({name}): {e}")
+                self.console.log(traceback.format_exc())
+                logfile.write(f"Exception: {e}\n")
+                logfile.write(traceback.format_exc())
+                with self.stop_lock:
+                    self.should_stop = True
+                return None
+
+
+class NodeConnectedSimulation(Simulation):
+    def __init__(
+        self,
+        model: ModelWrapper,
+        simulation_type: SimulationType,
+        fpgapart: str,
+        clk_ns: float,
+        functional_sim: bool,
+        workers: int | None = None,
+        max_qsrl_depth: int = 256,
+    ) -> None:
+        super().__init__(model, simulation_type, fpgapart, clk_ns, functional_sim, workers)
+        self.max_qsrl_depth = max_qsrl_depth
+
+    def simulate(
+        self,
+        depth: int | list[list[int]] | None = None,
+        max_cycles: int | None = None,
+        fifo_first_valid_cycles: list[list[int]] | None = None,
+    ) -> tuple[list[dict[str, list[int]]], bool]:
+        """Simulate the given number of samples for every layer. Layers are completely isolated
+        and simulated in parallel.
+        Simulation data is returned as a list of dicts (by node name as index).
+        """
+        if self.simulation_type != SimulationType.NODE_BASED_CONNECTED:
+            raise FINNInternalError(
+                f"Called simulation function 'simulate_node_connected' "
+                f"does not match provided simulation type "
+                f"{self.simulation_type}"
+            )
+        names = [node.name for node in self.model.graph.node]
+        initial_depth: Any = [[depth]] * len(self.binaries) if isinstance(depth, int) else depth
+
+        # For BRAM FIFOs (depth > max_qsrl_depth), hardware loses BRAM_FIFO_PIPELINE_OVERHEAD
+        # entries to internal pipeline registers *per BRAM sub-FIFO*.  Non-power-of-two depths
+        # are decomposed into several power-of-two sub-FIFOs (see get_fifo_split_configs), so
+        # the total overhead is num_bram_sub_fifos * BRAM_FIFO_PIPELINE_OVERHEAD.
+        # Rounding to a full BRAM block before calling get_fifo_split_configs is NOT needed:
+        # the decomposition works on any depth, and we want the sub-FIFO count for the exact
+        # depth under test.
+        if initial_depth is not None and not isinstance(initial_depth, int):
+            adjusted_depth: Any = [
+                [
+                    d - _count_bram_sub_fifos(d, self.max_qsrl_depth) * BRAM_FIFO_PIPELINE_OVERHEAD
+                    if d > self.max_qsrl_depth
+                    else d
+                    for d in node_depths
+                ]
+                for node_depths in initial_depth
+            ]
+        else:
+            adjusted_depth = initial_depth
+
+        # Run simulation
+        start = time.time()
+        output_json = Path(make_build_dir("simulation_results_")) / "simulation_data.json"
+        controller = NodeConnectedSimulationController(
+            len(self.binaries), names, list(self.binaries.values()), Console(), 0.1, False
+        )
+        controller.run(adjusted_depth, output_json, max_cycles, fifo_first_valid_cycles)
+        end = time.time()
+        log.debug(f"Simulation took {end - start} seconds!")
+
+        # Load the merged data from JSON
+        merged_data = json.loads(output_json.read_text())
+
+        # Return the collected data indexed by node index
+        data = []
+        for sim_entry in merged_data["simulations"]:
+            data.append(
+                {
+                    "name": sim_entry["name"],
+                    "fifo_utilization": sim_entry["fifo_utilization"],
+                    "fifo_depth": sim_entry["fifo_depth"],
+                    "cycles": sim_entry["cycles"],
+                    "samples": sim_entry["samples"],
+                    "intervals": sim_entry["intervals"],
+                    "fifo_cycles_until_first_valid": sim_entry["fifo_cycles_until_first_valid"],
+                }
+            )
+        json.dump(data, output_json.open("w"), indent=4)
+        return data, merged_data.get("timeout_occurred", False)
+
+
+class RunLayerParallelSimulation(Transformation):  # noqa
+    def __init__(
+        self,
+        fpgapart: str,
+        clk_ns: float,
+        cfg: DataflowBuildConfig,
+        minimization_orders: list[MinimizationOrder] | None = None,
+        max_qsrl_depth: int = 256,
+        vivado_ram_style: str = "auto",
+        quality_of_results: str = "default",
+    ) -> None:
+        """Run layer parallel simulations."""
+        super().__init__()
+        self.fpgapart = fpgapart
+        self.clk_ns = clk_ns
+        self.cfg = cfg
+        self.max_qsrl_depth = max_qsrl_depth
+        self.vivado_ram_style = vivado_ram_style
+        self.quality_of_results = quality_of_results
+        if minimization_orders is not None:
+            self.minimization_orders = minimization_orders
+        else:
+            # TODO: Set to ALL search orders
+            self.minimization_orders = [MinimizationOrder.NODE_ORDER]
+
+        self.final_depths: dict[MinimizationOrder, list[list[int]] | None] = dict.fromkeys(
+            self.minimization_orders
+        )
+
+    def create_starting_fifo_depths(
+        self, initial_fifo_depths: list[dict[str, list[int]]]
+    ) -> tuple[list[list[int]], list[list[int]]]:
+        """From the given initial_fifo_depths returned by the simulation, create a starting
+        FIFO depth configuration that can be modified sequentially by the minimization algorithm.
+        Also return the fifo_first_valid_cycles.
+        """
+        # Create fifo_depths (indexed by layer index and then stream index)
+        fifo_depths: list[list[int]] = []  # Each entry is a list of fifo sizes for that node
+        for val in initial_fifo_depths:
+            # Use _safe_bram_starting_depth so that simulate() (which subtracts
+            # num_sub_fifos*BRAM_FIFO_PIPELINE_OVERHEAD for BRAM depths) still sees a depth
+            # that covers the observed peak utilisation.  A flat +2 is insufficient when a
+            # depth decomposes into multiple BRAM sub-FIFOs (e.g. depth 1537 → 2 sub-FIFOs
+            # → 4 entries of overhead).
+            fifo_depths.append(
+                [_safe_bram_starting_depth(v, self.max_qsrl_depth) for v in val["fifo_utilization"]]
+            )
+        fifo_first_valid_cycles: list[list[int]] = []
+        for val in initial_fifo_depths:
+            fifo_first_valid_cycles.append(
+                [v + math.ceil(v * 0.01) for v in val["fifo_cycles_until_first_valid"]]
+            )  # Add 1% cycles grace period
+        return fifo_depths, fifo_first_valid_cycles
+
+    def get_minimization_order_indices(
+        self,
+        min_order: MinimizationOrder,
+        model: ModelWrapper,
+        bitwidths: list[int],
+    ) -> list[int]:
+        """Given a MinimizationOrder, return the list of indices to
+        access/minimize `fifo_depths` for that order. For example, NODE_ORDER would return
+        [0,1,2,...] and NODE_ORDER_REVERSED [N, N-1, N-2, ..., 0].
+        """
+        assert len(model.graph.node) == len(bitwidths)
+        match min_order:
+            case MinimizationOrder.NODE_ORDER:
+                return list(range(len(model.graph.node)))
+            case MinimizationOrder.REVERSE_NODE_ORDER:
+                return list(range(len(model.graph.node)))[::-1]
+            case (
+                MinimizationOrder.LARGEST_BITWIDTH_DIFF_FIRST
+                | MinimizationOrder.SMALLEST_BITWIDTH_DIFF_FIRST
+            ):
+                diffs: list[tuple[int, int]] = []  # (index, diff)
+                for i in range(len(model.graph.node)):
+                    hw: HWCustomOp = getCustomOp(model.graph.node[i])
+                    in_width = max(
+                        [hw.get_instream_width(j) for j in range(len(model.graph.node[i].input))]
+                    )
+                    out_width = max(
+                        [hw.get_outstream_width(j) for j in range(len(model.graph.node[i].output))]
+                    )
+                    diffs.append((i, in_width - out_width))
+                sorted_order = sorted(
+                    diffs,
+                    key=lambda x: x[1],
+                    reverse=(min_order == MinimizationOrder.LARGEST_BITWIDTH_DIFF_FIRST),
+                )
+                return [idx for idx, diff in sorted_order]
+            case _:
+                raise NotImplementedError()
+
+    def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, bool]:
+        """Run layer parallel simulations."""
+        sim = NodeConnectedSimulation(
+            model,
+            SimulationType.NODE_BASED_CONNECTED,
+            self.fpgapart,
+            self.clk_ns,
+            self.cfg.functional_simulation,
+            max_qsrl_depth=self.max_qsrl_depth,
+        )
+        model = sim.model  # TODO:clean up
+
+        # Create empty table for datapoints that will be collected
+        # First create as a nested dict, since not all data is avilable at the same time
+        # It is then flattened when creating the dataframe, so that node and stream are columns too
+        # df_data[node][stream_idx][columnm] = ...
+        df_data: dict[str, list[dict[str, Any]]] = {}
+        for nodeindex, node in enumerate(model.graph.node):
+            df_data[node.name] = []
+            for node_idx in range(len(node.output)):
+                df_data[node.name].append(
+                    {
+                        "onnx_index": nodeindex,
+                        "out_bitwidth": -1,
+                        "out_initial_fifo_depths": -1,
+                        "fifo_cycles_until_first_valid": -1,
+                        "successor_node": ", ".join(
+                            [node.name for node in model.find_consumers(node.output[node_idx])]
+                        ),
+                    }
+                )
+                for min_order in self.minimization_orders:
+                    df_data[node.name][-1][f"out_final_depth_{min_order.name}"] = -1
+                    df_data[node.name][-1][f"simulation_time_{min_order.name}"] = -1
+                    df_data[node.name][-1][f"minimization_iterations_{min_order.name}"] = -1
+
+        # TODO: The final depths contained a lot of -1 (default values).
+        # Did we need to write the initial depths into there?
+        # Or in case of minimization skip we likely need to write the values still.
+
+        # Running the initial simulation
+        log.info("Running initial node-connected simulation.")
+        initial_fifo_depths, _ = sim.simulate()
+
+        # Store the initial sizes as a report
+        initial_sizes_path = (
+            Path(self.cfg.output_dir) / "report" / "initial_fifo_sizes_sim_connected.json"
+        )
+        initial_sizes_path.write_text(json.dumps(initial_fifo_depths, indent=4))
+        log.info(f"Wrote initial sizes to: {initial_sizes_path}")
+
+        # Store initial sizes in dataframe as well
+        for layerdata in initial_fifo_depths:
+            for idx in range(len(layerdata["fifo_utilization"])):
+                name: str = cast("str", layerdata["name"])
+                df_data[name][idx]["out_initial_fifo_depths"] = layerdata["fifo_utilization"][idx]
+                df_data[name][idx]["fifo_cycles_until_first_valid"] = layerdata[
+                    "fifo_cycles_until_first_valid"
+                ][idx]
+
+        # List of list of fifo depths
+        fifo_depths, fifo_first_valid_cycles = self.create_starting_fifo_depths(initial_fifo_depths)
+
+        # Max cycles for any simulation
+        sim_cycles: int = cast("int", max([val["cycles"] for val in initial_fifo_depths]))
+
+        # Extract bitwidths from outstream widths of hw nodes
+        bit_widths = []
+        for node_idx in range(len(fifo_depths)):
+            bit_widths.append([])
+            hw_node = getCustomOp(model.graph.node[node_idx])
+            if isinstance(hw_node, HWCustomOp):
+                for fifo_idx in range(len(fifo_depths[node_idx])):
+                    bit_widths[node_idx].append(hw_node.get_outstream_width(fifo_idx))
+            else:
+                raise FINNInternalError("Non-HW node found in dataflow graph during simulation")
+
+        # Store bitwidths into dataframe as well
+        for node_idx in range(len(bit_widths)):
+            for fifo_idx in range(len(bit_widths[node_idx])):
+                df_data[model.graph.node[node_idx].name][fifo_idx]["out_bitwidth"] = bit_widths[
+                    node_idx
+                ][fifo_idx]
+
+        # Run minimization for every layer/stream
+        log.info("Minimizing layers...")
+        needs_minimization = []
+        for node_idx in range(len(fifo_depths)):
+            needs_minimization.append([True] * len(fifo_depths[node_idx]))
+        for node_idx in range(len(fifo_depths)):
+            for fifo_idx in range(len(fifo_depths[node_idx])):
+                # Check if we can reduce the fifo size
+
+                used_size = fifo_depths[node_idx][fifo_idx]
+                bw = bit_widths[node_idx][fifo_idx]
+
+                needs_minimization[node_idx][fifo_idx] = self._needs_minimization(used_size, bw)
+
+        # Total minimizations
+        total_minimizations = sum(len(streams) for streams in fifo_depths)
+
+        for k, minimization_order in enumerate(self.minimization_orders):
+            # Create a new empty FIFO depth list
+            fifo_depths, fifo_first_valid_cycles = self.create_starting_fifo_depths(
+                initial_fifo_depths
+            )
+
+            # Minimize FIFO depths using binary search over BRAM block counts
+            idx_order = self.get_minimization_order_indices(minimization_order, model, bit_widths)
+            if len(idx_order) != len(model.graph.node):
+                raise FINNInternalError(
+                    f"Expected index order length {len(model.graph.node)}, but got {len(idx_order)}"
+                )
+
+            log.info(
+                f"Minimizing using order: {minimization_order.name}. Index order is: {idx_order}"
+            )
+
+            done = 0
+            for node_idx in idx_order:
+                for fifo_idx in range(len(fifo_depths[node_idx])):
+                    if not needs_minimization[node_idx][fifo_idx]:
+                        df_data[model.graph.node[node_idx].name][fifo_idx][
+                            f"simulation_time_{minimization_order.name}"
+                        ] = 0.0
+                        df_data[model.graph.node[node_idx].name][fifo_idx][
+                            f"out_final_depth_{minimization_order.name}"
+                        ] = fifo_depths[node_idx][fifo_idx]
+                        df_data[model.graph.node[node_idx].name][fifo_idx][
+                            f"minimization_iterations_{minimization_order.name}"
+                        ] = 0
+                        log.info(
+                            f"[ {node_idx}.{fifo_idx} / {len(fifo_depths) - 1} ] "
+                            f"Skipping minimization for this stream."
+                        )
+                        done += 1
+                        continue
+
+                    minimization_start = time.time()
+                    minimized_depth, iterations_needed = self._minimize_fifo_depth(
+                        node_idx,
+                        fifo_idx,
+                        fifo_depths,  # current_depths: evolves as FIFOs are minimised
+                        bit_widths,
+                        initial_fifo_depths,
+                        sim,
+                        sim_cycles,
+                        fifo_first_valid_cycles,
+                    )
+                    minimization_time = time.time() - minimization_start
+
+                    # Store the minimized size
+                    fifo_depths[node_idx][fifo_idx] = minimized_depth
+                    done += 1
+
+                    # Store data into dataframe
+                    df_data[model.graph.node[node_idx].name][fifo_idx][
+                        f"simulation_time_{minimization_order.name}"
+                    ] = minimization_time
+                    df_data[model.graph.node[node_idx].name][fifo_idx][
+                        f"minimization_iterations_{minimization_order.name}"
+                    ] = iterations_needed
+                    df_data[model.graph.node[node_idx].name][fifo_idx][
+                        f"out_final_depth_{minimization_order.name}"
+                    ] = fifo_depths[node_idx][fifo_idx]
+                    log.debug(
+                        f"Set node/stream {node_idx}.{fifo_idx} to "
+                        f"depth {fifo_depths[node_idx][fifo_idx]}, in "
+                        f"{iterations_needed} iterations and {minimization_time} "
+                        f"seconds. (To {minimization_order.name})"
+                    )
+
+                    percentage = int(100.0 * float(done) / float(total_minimizations))
+                    log.info(
+                        f"[ [bold green]{percentage}%[/bold green] ] "
+                        f"[ {node_idx}.{fifo_idx} / {len(fifo_depths) - 1} ] Simulation completed "
+                        f"({iterations_needed} iterations).",
+                        extra={"markup": True, "highlighter": None},
+                    )
+
+            self.final_depths[minimization_order] = deepcopy(fifo_depths)
+
+            order_percent = int(100.0 * float(k + 1) / float(len(self.minimization_orders)))
+            log.info(
+                f"[ [bold gold1]{order_percent}%[/bold gold1] ] "
+                f"-----  Minimization order {minimization_order.name} completed -----",
+                extra={"markup": True, "highlighter": None},
+            )
+
+        # Store dataframe
+        df_keys = list(df_data[model.graph.node[0].name][0].keys())
+        log.debug(f"Saving keys: {df_keys} + [node, stream]")
+        df_dict = {}
+        df_dict["node"] = []
+        df_dict["stream"] = []
+        for k in df_keys:
+            df_dict[k] = []
+        for node, nodedata in df_data.items():
+            for streamindex, streamdata in enumerate(nodedata):
+                df_dict["node"].append(node)
+                df_dict["stream"].append(streamindex)
+                for key in streamdata.keys():
+                    df_dict[key].append(streamdata[key])
+
+        df = pd.DataFrame(df_dict)
+        model = store_fifo_data(
+            model,
+            df,
+            Path(self.cfg.output_dir) / "report" / "fifo_data.csv",
+            delete_existing=False,
+            store_html=True,
+        )
+
+        # Use the smallest fifo depths found (by total bytes)
+        smallest_order = self.minimization_orders[0]
+        smallest_size = None
+        for order in self.minimization_orders:
+            current_size = 0
+            depths = self.final_depths[order]
+            if depths is None:
+                raise FINNInternalError(
+                    f"Expected FIFO sizes for minimization order {order.name}, but found None."
+                )
+            for node_idx in range(len(depths)):
+                for fifo_idx in range(len(depths[node_idx])):
+                    current_size += depths[node_idx][fifo_idx] * bit_widths[node_idx][fifo_idx]
+
+            if smallest_size is None or current_size < smallest_size:
+                smallest_size = current_size
+                smallest_order = order
+
+        # Set the result fifo depths
+        fifo_depths = self.final_depths[smallest_order]
+        assert fifo_depths is not None
+
+        # Make sure that all FIFOs with depth > 256 use a full BRAM block,
+        # since partial blocks are not supported by Vivado HLS
+        for node_idx in range(len(fifo_depths)):
+            for fifo_idx in range(len(fifo_depths[node_idx])):
+                if fifo_depths[node_idx][fifo_idx] > self.max_qsrl_depth:
+                    bw = bit_widths[node_idx][fifo_idx]
+                    blocks = calculate_bram_blocks(fifo_depths[node_idx][fifo_idx], bw)
+                    # if len(fifo_depths[i]) > 1:
+                    #     blocks_plus_one = self._get_valid_block_counts(
+                    #         blocks + 1, blocks + 1000, bw
+                    #     )
+                    #     _, max_d = calculate_bram_depth_range(blocks_plus_one[0], bw)
+                    # else:
+                    _, max_d = calculate_bram_depth_range(blocks, bw)
+                    fifo_depths[node_idx][fifo_idx] = max_d
+
+        log.info("Final FIFO depths:")
+        for node_idx in range(len(fifo_depths)):
+            log.info(f"{node_idx}: {fifo_depths[node_idx]}")
+
+        log.info("Running final end-to-end validation simulation with minimised FIFO depths...")
+        validation_data, validation_timeout = sim.simulate(
+            fifo_depths,
+            max_cycles=math.ceil(sim_cycles * 1.05),
+            fifo_first_valid_cycles=fifo_first_valid_cycles,
+        )
+        if validation_timeout:
+            raise FINNUserError(
+                "Final validation simulation timed out with the jointly-minimised FIFO depths. "
+                "The per-FIFO minimisation may have produced a configuration that is "
+                "collectively too small.  Re-run with a larger initial depth or fewer "
+                "minimisation orders."
+            )
+        if self._check_performance(validation_data, initial_fifo_depths):
+            raise FINNUserError(
+                "Final validation simulation detected throughput degradation with the "
+                "jointly-minimised FIFO depths (intervals exceeded baseline). "
+                "The per-FIFO minimisation may have produced a configuration that is "
+                "collectively too small.  Re-run with a larger initial depth or fewer "
+                "minimisation orders."
+            )
+        log.info("Final validation simulation passed - minimised depths are correct.")
+
+        # Write back results. By default write to output_dir / "fifo_config.json"
+        writeback_path = Path(self.cfg.output_dir) / "fifo_config.json"
+        assert len(fifo_depths) == len(model.graph.node)
+        json_results = []
+        for node_idx, node in enumerate(model.graph.node):
+            json_results.append({"node": node.name, "depths": fifo_depths[node_idx]})
+        with writeback_path.open("w") as f:
+            json.dump(json_results, f)
+        log.info(f"Wrote results back to {writeback_path}")
+
+        return model, False
+
+    def _check_performance(
+        self, new_data: list[dict[str, list[int]]], initial_fifo_depths: list[dict[str, list[int]]]
+    ) -> bool:
+        """Check if performance has degraded compared to baseline.
+
+        Args:
+            new_data: Simulation results to check
+            initial_fifo_depths: Baseline performance data
+
+        Returns:
+            True if performance degraded, False otherwise
+        """
+        for new, initial in zip(new_data, initial_fifo_depths, strict=True):
+            if len(new["intervals"]) != len(initial["intervals"]):
+                raise FINNInternalError(
+                    "New simulation data has different number of streams than baseline."
+                )
+            for idx in range(len(new["intervals"])):
+                if new["intervals"][idx] > initial["intervals"][idx]:
+                    return True
+        return False
+
+    def _test_depth(
+        self,
+        test_depth: int,
+        node_idx: int,
+        fifo_idx: int,
+        current_depths: list[list[int]],
+        initial_fifo_depths: list[dict[str, list[int]]],
+        sim: NodeConnectedSimulation,
+        sim_cycles: float,
+        fifo_first_valid_cycles: list[list[int]],
+    ) -> tuple[bool, bool]:
+        """Test a specific FIFO depth.
+
+        Args:
+            test_depth: Depth to test
+            node_idx: Node index
+            fifo_idx: FIFO index within node
+            current_depths: Current working FIFO depth configuration.  FIFOs that have
+                already been minimised contain their final minimised depth; FIFOs not yet
+                processed still carry the safe starting depth.  This list is never
+                modified by this method - a deep copy is made before inserting
+                ``test_depth``.
+            initial_fifo_depths: Baseline performance data
+            sim: Simulation controller
+            sim_cycles: Maximum simulation cycles
+            fifo_first_valid_cycles: First valid cycle for each FIFO
+        Returns:
+            Tuple of (success, timeout) where success means depth works without degradation
+        """
+        test_depths = deepcopy(current_depths)
+        test_depths[node_idx][fifo_idx] = test_depth
+
+        new_simulation_data, timeout = sim.simulate(
+            test_depths,
+            max_cycles=min(
+                math.ceil(sim_cycles * 1.05), math.ceil(sim_cycles) + 10 * len(test_depths)
+            ),
+            fifo_first_valid_cycles=fifo_first_valid_cycles,
+        )
+
+        if timeout:
+            return False, True
+
+        performance_degraded = self._check_performance(new_simulation_data, initial_fifo_depths)
+        return not performance_degraded, False
+
+    def _get_valid_block_counts(self, min_blocks: int, max_blocks: int, bitwidth: int) -> list[int]:
+        """Get all valid BRAM block counts in the specified range.
+
+        Some block counts are invalid for certain bitwidths due to quantization.
+        This method returns only the valid configurations.
+
+        Args:
+            min_blocks: Minimum block count (inclusive)
+            max_blocks: Maximum block count (inclusive)
+            bitwidth: Data bitwidth
+
+        Returns:
+            Sorted list of valid block counts
+        """
+        valid_blocks = []
+        for blocks in range(min_blocks, max_blocks + 1):
+            _, max_d = calculate_bram_depth_range(blocks, bitwidth)
+            if max_d > 0:  # Valid configuration
+                valid_blocks.append(blocks)
+        return valid_blocks
+
+    def _minimize_fifo_depth(
+        self,
+        node_idx: int,
+        fifo_idx: int,
+        current_depths: list[list[int]],
+        bit_widths: list[list[int]],
+        initial_fifo_depths: list[dict[str, list[int]]],
+        sim: NodeConnectedSimulation,
+        sim_cycles: int,
+        fifo_first_valid_cycles: list[list[int]],
+    ) -> tuple[int, int]:
+        """Minimize a single FIFO depth using binary search.
+
+        Args:
+            node_idx: Node index
+            fifo_idx: FIFO index within node
+            current_depths: Current working FIFO depth configuration.  FIFOs that have
+                already been minimised in this pass carry their final minimised depth;
+                FIFOs not yet processed still carry the safe starting depth.  This list
+                is mutated by the caller (``apply``) after each call to store the
+                minimised result, so successive calls see the evolving state.
+            bit_widths: Bitwidths for all FIFOs
+            initial_fifo_depths: Baseline performance data
+            sim: Simulation controller
+            sim_cycles: Maximum simulation cycles
+            fifo_first_valid_cycles: First valid cycle for each FIFO
+        Returns:
+            Tuple: Minimized FIFO depth, Iterations required to arrive at the result
+        """
+        iterations = 0
+        original_size = current_depths[node_idx][fifo_idx]
+        bw = bit_widths[node_idx][fifo_idx]
+
+        log.debug(f"Minimizing Node {node_idx} FIFO {fifo_idx}: original depth {original_size}")
+
+        # If FIFO depth of 32 works, use it because it fits into bw/2 LUTs
+        success, timeout = self._test_depth(
+            32,
+            node_idx,
+            fifo_idx,
+            current_depths,
+            initial_fifo_depths,
+            sim,
+            sim_cycles,
+            fifo_first_valid_cycles,
+        )
+        iterations += 1
+        if success:
+            return 32, iterations
+
+        if original_size <= self.max_qsrl_depth:
+            upper_luts = calculate_srl16e_luts(original_size, bw)
+            # LUTRAM based FIFOs have block sizes of 32, so smallest after 32 is 64
+            lower_luts = calculate_srl16e_luts(64, bw)
+
+            # Binary search if there's room to search
+            if upper_luts > lower_luts:
+                best_working_depth, bin_it = self._binary_search_srl_depth(
+                    node_idx,
+                    fifo_idx,
+                    current_depths,
+                    bw,
+                    initial_fifo_depths,
+                    sim,
+                    sim_cycles,
+                    fifo_first_valid_cycles,
+                    lower_luts=lower_luts,
+                    upper_luts=upper_luts,
+                )
+                iterations += bin_it
+                return best_working_depth, iterations
+            return original_size, iterations
+
+        # Try FIFO depth of 256 next (fits into LUTRAM)
+        success, timeout = self._test_depth(
+            self.max_qsrl_depth,
+            node_idx,
+            fifo_idx,
+            current_depths,
+            initial_fifo_depths,
+            sim,
+            sim_cycles,
+            fifo_first_valid_cycles,
+        )
+        iterations += 1
+        if success:
+            upper_luts = calculate_srl16e_luts(self.max_qsrl_depth, bw)
+            # LUTRAM based FIFOs have block sizes of 32, so smallest after 32 is 64
+            lower_luts = calculate_srl16e_luts(64, bw)
+
+            # Binary search if there's room to search
+            if upper_luts > lower_luts:
+                best_working_depth, bin_it = self._binary_search_srl_depth(
+                    node_idx,
+                    fifo_idx,
+                    current_depths,
+                    bw,
+                    initial_fifo_depths,
+                    sim,
+                    sim_cycles,
+                    fifo_first_valid_cycles,
+                    lower_luts=lower_luts,
+                    upper_luts=upper_luts,
+                )
+                iterations += bin_it
+                return best_working_depth, iterations
+            return self.max_qsrl_depth, iterations
+
+        # We know 256 doesn't work, so we have to use BRAMs
+        # Try one BRAM block less than current
+        upper_blocks = calculate_bram_blocks(original_size, bw)
+        # Get all valid block counts in the range
+        valid_blocks = self._get_valid_block_counts(1, upper_blocks - 1, bw)
+        if not valid_blocks:
+            # No valid configurations exist
+            return original_size, iterations
+        # Test the maximum valid block count first
+        # (largest depth below original, most likely to succeed)
+        max_valid_blocks = valid_blocks[-1]
+        _, max_d = calculate_bram_depth_range(max_valid_blocks, bw)
+
+        success, timeout = self._test_depth(
+            max_d,
+            node_idx,
+            fifo_idx,
+            current_depths,
+            initial_fifo_depths,
+            sim,
+            sim_cycles,
+            fifo_first_valid_cycles,
+        )
+        iterations += 1
+
+        if timeout or not success:
+            return original_size, iterations
+
+        best_working_depth = max_d
+
+        # Binary search if there's room to search and multiple valid configs
+        if len(valid_blocks) > 1:
+            best_working_depth, bin_it = self._exponential_binary_search_depth(
+                node_idx,
+                fifo_idx,
+                current_depths,
+                bw,
+                initial_fifo_depths,
+                sim,
+                sim_cycles,
+                fifo_first_valid_cycles,
+                valid_blocks=valid_blocks,
+            )
+            iterations += bin_it
+
+        return best_working_depth, iterations
+
+    def _exponential_binary_search_depth(
+        self,
+        node_idx: int,
+        fifo_idx: int,
+        current_depths: list,
+        bitwidth: int,
+        initial_fifo_depths: list[dict[str, list[int]]],
+        sim: NodeConnectedSimulation,
+        sim_cycles: float,
+        fifo_first_valid_cycles: list[list[int]],
+        valid_blocks: list[int],
+    ) -> tuple[int, int]:
+        """Perform exponential + binary search over valid block configurations.
+
+        Uses exponential search to quickly find the range, then binary search within it.
+        This is more efficient when smaller block counts are more likely.
+        Only searches over pre-validated block counts.
+
+        Args:
+            node_idx: Node index
+            fifo_idx: FIFO index within node
+            current_depths: Current working FIFO depth configuration.  FIFOs already
+                minimised in this pass carry their final depth; this list must not be
+                modified directly (``_test_depth`` deep-copies it before trial edits).
+            bitwidth: Data bitwidth
+            initial_fifo_depths: Baseline performance data
+            sim: Simulation controller
+            sim_cycles: Maximum simulation cycles
+            fifo_first_valid_cycles: First valid cycle for each FIFO
+            valid_blocks: Sorted list of valid block counts to search over
+
+        Returns:
+            Tuple: Best working depth found, Number of iterations required to arrive at this result.
+        """
+        iterations = 0
+        if not valid_blocks:
+            raise FINNInternalError("valid_blocks list cannot be empty")
+
+        # Start with the largest valid block count (known to work from caller)
+        _, max_d = calculate_bram_depth_range(valid_blocks[-1], bitwidth)
+        best_working_depth = max_d
+
+        # Exponential search phase: find range where solution exists
+        # Check positions: 0, 1, 2, 4, 8, ... indices in valid_blocks list
+        lower_idx = 0
+        upper_idx = len(valid_blocks) - 1
+        exp_idx = 0
+        last_failed_idx = -1
+
+        while exp_idx < upper_idx:
+            blocks = valid_blocks[exp_idx]
+            _, max_d = calculate_bram_depth_range(blocks, bitwidth)
+
+            success, _ = self._test_depth(
+                max_d,
+                node_idx,
+                fifo_idx,
+                current_depths,
+                initial_fifo_depths,
+                sim,
+                sim_cycles,
+                fifo_first_valid_cycles,
+            )
+            iterations += 1
+
+            if success:
+                # Found a working depth, now binary search in [last_failed_idx+1, exp_idx]
+                best_working_depth = max_d
+                lower_idx = last_failed_idx + 1
+                upper_idx = exp_idx
+                break
+            # This doesn't work, try exponentially larger index
+            last_failed_idx = exp_idx
+            exp_idx = min(exp_idx * 2 if exp_idx > 0 else 1, upper_idx)
+
+        # Binary search phase: refine the range
+        while lower_idx < upper_idx:
+            mid_idx = (lower_idx + upper_idx) // 2
+            blocks = valid_blocks[mid_idx]
+            _, max_d = calculate_bram_depth_range(blocks, bitwidth)
+
+            success, _ = self._test_depth(
+                max_d,
+                node_idx,
+                fifo_idx,
+                current_depths,
+                initial_fifo_depths,
+                sim,
+                sim_cycles,
+                fifo_first_valid_cycles,
+            )
+            iterations += 1
+
+            if success:
+                # This depth works, try smaller (lower indices)
+                best_working_depth = max_d
+                upper_idx = mid_idx
+            else:
+                # This depth doesn't work, need larger (higher indices)
+                lower_idx = mid_idx + 1
+
+        return best_working_depth, iterations
+
+    def _binary_search_srl_depth(
+        self,
+        node_idx: int,
+        fifo_idx: int,
+        current_depths: list,
+        bitwidth: int,
+        initial_fifo_depths: list[dict[str, list[int]]],
+        sim: NodeConnectedSimulation,
+        sim_cycles: float,
+        fifo_first_valid_cycles: list[list[int]],
+        lower_luts: int,
+        upper_luts: int,
+    ) -> tuple[int, int]:
+        """Perform binary search to find minimal working FIFO depth in LUTRAM range.
+
+        Args:
+            node_idx: Node index
+            fifo_idx: FIFO index within node
+            current_depths: Current working FIFO depth configuration.  FIFOs already
+                minimised in this pass carry their final depth; this list must not be
+                modified directly (``_test_depth`` deep-copies it before trial edits).
+            bitwidth: Data bitwidth
+            initial_fifo_depths: Baseline performance data
+            sim: Simulation controller
+            sim_cycles: Maximum simulation cycles
+            fifo_first_valid_cycles: First valid cycle for each FIFO
+            lower_luts: Lower bound for LUT count
+            upper_luts: Upper bound for LUT count (known to work)
+
+        Returns:
+            Tuple: Best working depth found, Number of Iterations required to arrive at this result
+        """
+        iterations = 0
+        _, max_d = calculate_srl16e_depth_range(upper_luts, bitwidth)
+        best_working_depth = max_d
+
+        while lower_luts < upper_luts:
+            mid_luts = (lower_luts + upper_luts) // 2
+
+            # Prevent infinite loop
+            if mid_luts == upper_luts:
+                mid_luts = upper_luts - 1
+            if mid_luts < lower_luts:
+                break
+
+            # Find valid depth for this LUT count
+            _, max_d = calculate_srl16e_depth_range(mid_luts, bitwidth)
+
+            if max_d == 0:
+                # No valid configuration, try more LUTs
+                lower_luts = mid_luts + 1
+                continue
+
+            success, _ = self._test_depth(
+                max_d,
+                node_idx,
+                fifo_idx,
+                current_depths,
+                initial_fifo_depths,
+                sim,
+                sim_cycles,
+                fifo_first_valid_cycles,
+            )
+            iterations += 1
+
+            if success:
+                # This depth works, try smaller
+                best_working_depth = max_d
+                upper_luts = mid_luts
+            else:
+                # This depth doesn't work, need larger
+                lower_luts = mid_luts + 1
+
+        return best_working_depth, iterations
+
+    def _needs_minimization(self, fifo_depth: int, bitwidth: int) -> bool:
+        """Determine whether a FIFO can be minimized further.
+
+        Args:
+            fifo_depth: Current FIFO depth
+            bitwidth: Data bitwidth
+
+        Returns:
+            True if the FIFO can be minimized further, False otherwise.
+        """
+        # Qsrl FIFO Formula: LUTs = ⌈depth/32⌉ x ⌈bitwidth/2⌉
+        if fifo_depth <= 32:  # FIFOs of depth <=32 fit into bitwidth/2 LUTs
+            return False
+        # Return False if exactly the minimum number of possible BRAM blocks is used for this
+        # bitwidth and depth is sufficiently large that further optimization is unlikely to succeed
+        return not (
+            calculate_bram_blocks(fifo_depth, bitwidth)
+            <= self._get_valid_block_counts(1, bitwidth, bitwidth)[0]
+            and fifo_depth > math.floor(self.max_qsrl_depth * 1.1)
+        )
+
+
+def calculate_bram_blocks(depth: int, bitwidth: int) -> int:
+    """Calculate the number of BRAM blocks required for a BRAM FIFO.
+
+    Args:
+        depth: FIFO depth
+        bitwidth: Data bitwidth
+    """
+    if bitwidth == 1:
+        return math.ceil(depth / 16384)
+    if bitwidth == 2:
+        return math.ceil(depth / 8192)
+    if bitwidth <= 4:
+        return (math.ceil(depth / 4096)) * (math.ceil(bitwidth / 4))
+    if bitwidth <= 9:
+        return (math.ceil(depth / 2048)) * (math.ceil(bitwidth / 9))
+    if bitwidth <= 18 or depth > 512:
+        return (math.ceil(depth / 1024)) * (math.ceil(bitwidth / 18))
+    return (math.ceil(depth / 512)) * (math.ceil(bitwidth / 36))
+
+
+def calculate_bram_depth_range(blocks: int, bitwidth: int) -> tuple[int, int]:
+    """Calculate the range of FIFO depths that use exactly the given number of BRAM blocks.
+
+    Args:
+        blocks: Number of BRAM blocks
+        bitwidth: Data bitwidth
+
+    Returns:
+        Tuple of (min_depth, max_depth) that uses exactly 'blocks' BRAM blocks.
+    """
+    if blocks < 1:
+        raise FINNInternalError("Number of BRAM blocks must be at least 1")
+
+    # Invert the formula from calculate_bram_blocks based on bitwidth
+    if bitwidth == 1:
+        # blocks = ⌈depth/16384⌉
+        # Inversion: (blocks-1)*16384 < depth ≤ blocks*16384
+        min_depth = (blocks - 1) * 16384 + 1 if blocks > 1 else 1
+        max_depth = blocks * 16384
+    elif bitwidth == 2:
+        # blocks = ⌈depth/8192⌉
+        # Inversion: (blocks-1)*8192 < depth ≤ blocks*8192
+        min_depth = (blocks - 1) * 8192 + 1 if blocks > 1 else 1
+        max_depth = blocks * 8192
+    elif bitwidth <= 4:
+        # blocks = ⌈depth/4096⌉ * ⌈bitwidth/4⌉
+        bitwidth_factor = math.ceil(bitwidth / 4)
+        depth_blocks = math.ceil(blocks / bitwidth_factor)
+        min_depth = (depth_blocks - 1) * 4096 + 1 if depth_blocks > 1 else 1
+        max_depth = depth_blocks * 4096
+    elif bitwidth <= 9:
+        # blocks = ⌈depth/2048⌉ * ⌈bitwidth/9⌉
+        bitwidth_factor = math.ceil(bitwidth / 9)
+        depth_blocks = math.ceil(blocks / bitwidth_factor)
+        min_depth = (depth_blocks - 1) * 2048 + 1 if depth_blocks > 1 else 1
+        max_depth = depth_blocks * 2048
+    elif bitwidth <= 18:
+        # blocks = ⌈depth/1024⌉ * ⌈bitwidth/18⌉
+        bitwidth_factor = math.ceil(bitwidth / 18)
+        depth_blocks = math.ceil(blocks / bitwidth_factor)
+        min_depth = (depth_blocks - 1) * 1024 + 1
+        max_depth = depth_blocks * 1024
+    else:
+        # bitwidth > 18, split into two cases from original function
+        # Case 1: depth > 512 uses ⌈depth/1024⌉ * ⌈bitwidth/18⌉
+        # Case 2: depth ≤ 512 uses ⌈depth/512⌉ * ⌈bitwidth/36⌉
+
+        # Try the depth > 512 case first (⌈depth/1024⌉ * ⌈bitwidth/18⌉)
+        bitwidth_factor = math.ceil(bitwidth / 18)
+        depth_blocks = math.ceil(blocks / bitwidth_factor)
+
+        # Check if blocks is achievable with this bitwidth factor
+        if blocks % bitwidth_factor != 0 or depth_blocks < 1:
+            # Try the depth ≤ 512 case instead
+            pass
+        else:
+            min_depth = max((depth_blocks - 1) * 1024 + 1, 513)  # Must be > 512
+            max_depth = depth_blocks * 1024
+            # Check if this range is valid (entirely > 512)
+            if min_depth > 512 and calculate_bram_blocks(min_depth, bitwidth) == blocks:
+                return (min_depth, max_depth)
+
+        # Try the depth ≤ 512 case (⌈depth/512⌉ * ⌈bitwidth/36⌉)
+        bitwidth_factor = math.ceil(bitwidth / 36)
+        depth_blocks = math.ceil(blocks / bitwidth_factor)
+
+        # Check if blocks is achievable with this bitwidth factor
+        if blocks % bitwidth_factor != 0 or depth_blocks < 1:
+            return (0, 0)  # Invalid block count for this bitwidth
+
+        min_depth = (depth_blocks - 1) * 512 + 1 if depth_blocks > 1 else 1
+        max_depth = min(depth_blocks * 512, 512)  # Must be ≤ 512
+
+        # Verify the range is valid (entirely ≤ 512 and produces correct block count)
+        if max_depth <= 512 and calculate_bram_blocks(min_depth, bitwidth) == blocks:
+            return (min_depth, max_depth)
+
+        return (0, 0)  # No valid range found
+
+    # Verify the range is valid
+    if calculate_bram_blocks(min_depth, bitwidth) != blocks:
+        raise FINNInternalError("Calculated BRAM depth range is invalid!")
+    return (min_depth, max_depth)
+
+
+def calculate_uram_blocks(depth: int, bitwidth: int) -> int:
+    """Calculate the number of URAM blocks required for a URAM FIFO.
+
+    Args:
+        depth: FIFO depth
+        bitwidth: Data bitwidth
+    """
+    return (math.ceil(depth / 4096)) * (math.ceil(bitwidth / 72))
+
+
+def calculate_uram_depth_range(blocks: int, bitwidth: int) -> tuple[int, int]:
+    """Calculate the range of FIFO depths that use exactly the given number of URAM blocks.
+
+    Args:
+        blocks: Number of URAM blocks
+        bitwidth: Data bitwidth
+
+    Returns:
+        Tuple of (min_depth, max_depth) that uses exactly 'blocks' URAM blocks.
+        Returns (0, 0) if no valid range exists.
+    """
+    if blocks < 1:
+        return (0, 0)
+
+    # URAM formula: blocks = ⌈depth/4096⌉ * ⌈bitwidth/72⌉
+    bitwidth_factor = math.ceil(bitwidth / 72)
+
+    # Calculate depth range
+    # Minimum depth: (blocks / bitwidth_factor - 1) * 4096 + 1
+    # Maximum depth: (blocks / bitwidth_factor) * 4096
+
+    if blocks % bitwidth_factor != 0:
+        return (0, 0)  # Invalid block count for this bitwidth
+
+    depth_blocks = blocks // bitwidth_factor
+    min_depth = (depth_blocks - 1) * 4096 + 1 if depth_blocks > 1 else 1
+    max_depth = depth_blocks * 4096
+
+    # Verify
+    if calculate_uram_blocks(min_depth, bitwidth) != blocks:
+        return (0, 0)
+
+    return (min_depth, max_depth)
+
+
+def calculate_srl16e_luts(depth: int, bitwidth: int) -> int:
+    """Calculate the number of SRL16E LUTs required for a FIFO.
+
+    Args:
+        depth: FIFO depth (must be >= 2)
+        bitwidth: Data bitwidth
+
+    Returns:
+        Number of SRL16E LUTs required without adress LUTs.
+
+    Formula: LUTs = ⌈depth/32⌉ x ⌈bitwidth/2⌉
+    """
+    ram_luts = (math.ceil(depth / 32)) * (math.ceil(bitwidth / 2))
+    return ram_luts
+
+
+def calculate_srl16e_depth_range(luts: int, bitwidth: int) -> tuple[int, int]:
+    """Calculate the range of FIFO depths that use exactly the given number of SRL16E LUTs.
+
+    Args:
+        luts: Number of SRL16E LUTs
+        bitwidth: Data bitwidth
+
+    Returns:
+        Tuple of (min_depth, max_depth) that uses exactly 'luts' LUTs.
+        Returns (0, 0) if no valid range exists.
+    """
+    if luts < 1:
+        return (0, 0)
+
+    # SRL16E formula: luts = ⌈depth/32⌉ * ⌈bitwidth/2⌉
+    bitwidth_factor = math.ceil(bitwidth / 2)
+
+    # Calculate depth range
+    if luts % bitwidth_factor != 0:
+        return (0, 0)  # Invalid LUT count for this bitwidth
+
+    depth_blocks = luts // bitwidth_factor
+    min_depth = (depth_blocks - 1) * 32 + 1 if depth_blocks > 1 else 2
+    max_depth = depth_blocks * 32
+
+    # Verify
+    if calculate_srl16e_luts(min_depth, bitwidth) != luts:
+        return (0, 0)
+
+    return (min_depth, max_depth)
diff --git a/src/finn/transformation/fpgadataflow/simulation_controller.py b/src/finn/transformation/fpgadataflow/simulation_controller.py
new file mode 100644
index 0000000000..feaff91bad
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/simulation_controller.py
@@ -0,0 +1,340 @@
+"""Control (node based) simulations via unix sockets."""
+
+import json
+import os
+import socket
+import subprocess
+import threading
+import time
+from pathlib import Path
+from rich.console import Console
+from threading import Lock
+from typing import Any
+
+from finn.util.basic import make_build_dir
+from finn.util.exception import FINNInternalError
+from finn.util.logging import ThreadsafeProgressDisplay
+
+
+class SimulationController:
+    """Control a node-node IPC connected simulation in threads."""
+
+    def __init__(
+        self,
+        parallel_simulations: int,
+        names: list[str],
+        binaries: list[Path],
+        console: Console,
+        poll_interval: float = 1.0,
+        with_progressbar: bool = True,
+    ) -> None:
+        """Create a new controller, without starting the simulation.
+
+        Args:
+            parallel_simulations: Number of simulations to run in parallel.
+            names: List of names for the simulations.
+            binaries: List of paths to the simulation binaries.
+            console: The rich.console.Console to print with.
+            poll_interval: How long the wait between checks of the processes stdout/stdin is.
+            with_progressbar: Whether or not to display a progressbar for the cycle count.
+        """
+        if len(names) != len(binaries):
+            raise FINNInternalError(
+                f"Simulation controller received non-matching "
+                f"name and binary count: {len(names)} and {len(binaries)}"
+            )
+        self.binaries = binaries
+        self.names = names
+        self.console = console
+        self.poll_interval = poll_interval
+        self.workers = parallel_simulations
+        self.progress = None
+        if with_progressbar:
+            self.progress = ThreadsafeProgressDisplay(names, [0] * len(names), names)
+        self.running_lock = Lock()
+        self.running = 0
+        self.total = len(names)
+        self.logdir = Path(make_build_dir("simulation_logfiles_"))
+
+        # Socket communication management
+        self.processes: list[tuple[subprocess.Popen, Any, Any]] = []
+        self.sockets: list[tuple[socket.socket, str]] = []
+
+        # Early termination flag
+        self.should_stop = False
+        self.stop_lock = Lock()
+
+    def _start_process(self, binary: Path, process_id: int, cpu: int = -1) -> int:
+        """Start a single C++ simulation process with its own Unix socket.
+
+        Args:
+            binary: Path to the simulation executable
+            process_id: Unique identifier for this process
+            cpu: CPU core to bind to (if -1, no binding)
+
+        Returns:
+            Index of the started process
+        """
+        thread_id = threading.get_ident()
+
+        # Create unique socket path which includes thread ID to avoid conflicts
+        # with multiple threads
+        socket_path = Path(f"/tmp/fifosim_sockets/{thread_id}/")
+        socket_path.mkdir(parents=True, exist_ok=True)
+        socket_path = socket_path / f"sim_socket_{process_id}.sock"
+
+        # Remove socket if it exists
+        if socket_path.exists():
+            socket_path.unlink()
+
+        # Build command arguments
+        cmd = [str(binary), "--socket", socket_path]
+
+        # Create log files for stdout and stderr
+        stdout_log = self.logdir / f"{process_id}_stdout_cpp.log"
+        stderr_log = self.logdir / f"{process_id}_stderr_cpp.log"
+
+        stdout_file = stdout_log.open("w")
+        stderr_file = stderr_log.open("w")
+
+        # Start C++ process - redirect stdout/stderr to files
+        cwd = binary.parent
+        # Set CPU affinity if a specific core is requested
+        preexec_fn = (lambda: os.sched_setaffinity(0, {cpu})) if cpu != -1 else None
+        proc = subprocess.Popen(
+            cmd, stdout=stdout_file, stderr=stderr_file, text=True, cwd=cwd, preexec_fn=preexec_fn
+        )
+
+        # Check if process started successfully
+        time.sleep(0.2)  # Give process time to fail if there's an immediate error
+        if proc.poll() is not None:
+            stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr"
+            stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout"
+            stdout_file.close()
+            stderr_file.close()
+            msg = (
+                f"C++ process exited immediately with code {proc.returncode}\n"
+                f"Stderr: {stderr_output}\nStdout: {stdout_output}"
+            )
+            self.console.log(str(process_id) + ": " + msg)
+            raise RuntimeError(msg)
+
+        # Create Unix socket and connect
+        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+
+        # Wait for C++ process to create socket (with timeout)
+        max_retries = 100  # 20 seconds total
+        connected = False
+        for i in range(max_retries):
+            # Check if process is still alive
+            if proc.poll() is not None:
+                stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr"
+                stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout"
+                stdout_file.close()
+                stderr_file.close()
+                msg = (
+                    f"C++ process died during socket wait with code {proc.returncode}\n"
+                    f"Stderr: {stderr_output}\nStdout: {stdout_output}"
+                )
+                self.console.log(str(process_id) + ": " + msg)
+                raise RuntimeError(msg)
+
+            try:
+                sock.connect(str(socket_path))
+                connected = True
+                break
+            except (FileNotFoundError, ConnectionRefusedError) as e:
+                if i == max_retries - 1:
+                    stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr"
+                    stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout"
+                    stdout_file.close()
+                    stderr_file.close()
+                    msg = (
+                        f"Failed to connect to socket after {max_retries} retries\n"
+                        f"Stderr: {stderr_output}\nStdout: {stdout_output}"
+                    )
+                    self.console.log(str(process_id) + ": " + msg)
+                    raise RuntimeError(msg) from e
+                time.sleep(0.2)
+
+        if not connected:
+            stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr"
+            stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout"
+            stdout_file.close()
+            stderr_file.close()
+            msg = (
+                f"Failed to connect to socket {socket_path}\n"
+                f"Stderr: {stderr_output}\nStdout: {stdout_output}"
+            )
+            self.console.log(str(process_id) + ": " + msg)
+            raise RuntimeError(msg)
+
+        self.processes.append((proc, stdout_file, stderr_file))
+        self.sockets.append((sock, str(socket_path)))
+        return len(self.processes) - 1
+
+    def _send_command(self, process_idx: int, command: str, payload: dict[str, Any]) -> None:
+        """Send command and payload to a specific process.
+
+        Args:
+            process_idx: Index of the process to send to
+            command: Command string (e.g., "start", "status", "stop")
+            payload: Dictionary containing command-specific data
+        """
+        sock, _ = self.sockets[process_idx]
+
+        message = {"command": command, "payload": payload}
+
+        # Send length-prefixed message
+        msg_str = json.dumps(message)
+        msg_bytes = msg_str.encode("utf-8")
+        length = len(msg_bytes)
+
+        # Send 4-byte length prefix (little-endian)
+        sock.sendall(length.to_bytes(4, byteorder="little"))
+        # Send actual message
+        sock.sendall(msg_bytes)
+
+    def _receive_response(self, process_idx: int) -> dict[str, Any] | None:
+        """Receive response from a specific process.
+
+        Args:
+            process_idx: Index of the process to receive from
+
+        Returns:
+            Dictionary containing the response, or None if error
+
+        Raises:
+            TimeoutError: If socket times out waiting for response
+        """
+        sock, _ = self.sockets[process_idx]
+
+        # Set 120 second timeout to prevent deadlocks
+        # Needs to be rather larger to give the simulation IO thread time to answer
+        sock.settimeout(120.0)
+
+        # Read 4-byte length prefix
+        length_bytes = sock.recv(4)
+        if not length_bytes:
+            self.console.log(f"{process_idx}: Client disconnected.")
+            return None
+
+        length = int.from_bytes(length_bytes, byteorder="little")
+
+        # Read message data
+        msg_bytes = b""
+        while len(msg_bytes) < length:
+            chunk = sock.recv(length - len(msg_bytes))
+            if not chunk:
+                break
+            msg_bytes += chunk
+
+        return json.loads(msg_bytes.decode("utf-8"))
+
+    def _send_and_receive(
+        self, process_idx: int, command: str, payload: dict[str, Any]
+    ) -> dict[str, Any] | None:
+        """Send command and wait for response (convenience method).
+
+        Args:
+            process_idx: Index of the process
+            command: Command string
+            payload: Command payload
+
+        Returns:
+            Response dictionary
+
+        Raises:
+            RuntimeError: If the subprocess has terminated with an error
+        """
+        try:
+            self._send_command(process_idx, command, payload)
+            response = self._receive_response(process_idx)
+
+            # If we got None (timeout or connection error), check if process crashed
+            if response is None:
+                proc, stdout_file, stderr_file = self.processes[process_idx]
+                returncode = proc.poll()
+
+                if returncode is not None and returncode != 0:
+                    # Process has terminated with an error
+                    # Flush and read error logs
+                    stdout_file.flush()
+                    stderr_file.flush()
+
+                    stdout_log = self.logdir / f"{process_idx}_stdout_cpp.log"
+                    stderr_log = self.logdir / f"{process_idx}_stderr_cpp.log"
+
+                    stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr"
+                    stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout"
+
+                    # Raise the actual error from the subprocess
+                    msg = (
+                        f"Subprocess (process_idx={process_idx}) terminated with"
+                        f" exit code {returncode}.\n"
+                        f"Stderr:\n{stderr_output}\n"
+                        f"Stdout:\n{stdout_output}"
+                    )
+                    raise RuntimeError(msg) from None
+
+            return response
+        except (BrokenPipeError, ConnectionResetError, TimeoutError) as err:
+            # Connection error or timeout means the subprocess may have died
+            # Check if it exited with an error and raise that instead
+            proc, stdout_file, stderr_file = self.processes[process_idx]
+            returncode = proc.poll()
+
+            if returncode is not None and returncode != 0:
+                # Process has terminated with an error
+                # Flush and read error logs
+                stdout_file.flush()
+                stderr_file.flush()
+
+                stdout_log = self.logdir / f"{process_idx}_stdout_cpp.log"
+                stderr_log = self.logdir / f"{process_idx}_stderr_cpp.log"
+
+                stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr"
+                stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout"
+
+                # Raise the actual error from the subprocess
+                msg = (
+                    f"Subprocess (process_idx={process_idx}) terminated with"
+                    f" exit code {returncode}.\n"
+                    f"Stderr:\n{stderr_output}\n"
+                    f"Stdout:\n{stdout_output}"
+                )
+                raise RuntimeError(msg) from err  # from None
+
+            # If process exited cleanly (returncode == 0) or hasn't exited yet,
+            # this is an unexpected connection error
+            return None
+
+    def _cleanup_sockets(self) -> None:
+        """Close all sockets and terminate all processes."""
+        # Send stop command to all processes
+        errors = []
+        for i in range(len(self.processes)):
+            try:
+                self._send_command(i, "stop", {})
+                self._receive_response(i)
+            except Exception as e:  # noqa
+                errors.append((i, e))
+
+        # Close sockets
+        for sock, socket_path in self.sockets:
+            sock.close()
+            socket_path_obj = Path(socket_path)
+            if socket_path_obj.exists():
+                socket_path_obj.unlink(True)
+
+        # Terminate processes and close file handles
+        for proc, stdout_file, stderr_file in self.processes:
+            proc.terminate()
+            try:
+                proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+            finally:
+                stdout_file.close()
+                stderr_file.close()
diff --git a/src/finn/transformation/fpgadataflow/simulation_isolated.py b/src/finn/transformation/fpgadataflow/simulation_isolated.py
new file mode 100644
index 0000000000..a597918c45
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/simulation_isolated.py
@@ -0,0 +1,642 @@
+"""Simulating layers on their own to observe their behaviour."""
+import io
+import json
+import pandas as pd
+import re
+import time
+from collections.abc import Callable
+from concurrent.futures import Future, ThreadPoolExecutor
+from pathlib import Path, PosixPath, PurePath
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.transformation.base import Transformation
+from rich.console import Console
+from threading import Lock
+from typing import Any, Literal, TypeAlias
+
+from finn.transformation.fpgadataflow.simulation import Simulation, store_fifo_data
+from finn.transformation.fpgadataflow.simulation_build import SimulationType
+from finn.transformation.fpgadataflow.simulation_controller import SimulationController
+from finn.util.exception import FINNInternalError
+from finn.util.logging import log
+
+
+def get_time() -> str:
+    """Return the current time in a formatted hour:minutes:second string."""
+    return f"[{time.strftime('%H:%M:%S')}]"
+
+
+class NodeIsolatedSimulationController(SimulationController):
+    """Run simulations for node isolated cases."""
+
+    IsolatedSimLogData = dict[Literal["ready", "valid"], list[dict[str, int]]]
+
+    def __init__(
+        self,
+        parallel_simulations: int,
+        names: list[str],
+        binaries: list[Path],
+        console: Console,
+        poll_interval: float = 1.0,
+        with_progressbar: bool = False,
+    ) -> None:
+        """Set up node isolated simulation."""
+        super().__init__(
+            parallel_simulations, names, binaries, console, poll_interval, with_progressbar
+        )
+        log.info("Started simulation controller")
+
+    def get_logfile_path(self, binary_or_idx: Path | int) -> Path:
+        """Get the logfile for the given binary or process index."""
+        if type(binary_or_idx) is int:
+            return (
+                self.logdir / f"{binary_or_idx}_log_isolated_"
+                f"{self.names[binary_or_idx]}_python.txt"
+            )
+        elif type(binary_or_idx) in [Path, PurePath, PosixPath]:  # noqa
+            process_idx = self.binaries.index(binary_or_idx)  # type: ignore
+            return self.logdir / f"{process_idx}_log_isolated_{self.names[process_idx]}_python.txt"
+        raise TypeError("Pass either a simulation binary path of an index")
+
+    def write_log(self, logfile: io.TextIOWrapper, msg: str, flush: bool = True) -> None:
+        """Write a timestamped message to log."""
+        logfile.write(f"{get_time()} {msg}\n")
+        if flush:
+            logfile.flush()
+
+    def collect_results(
+        self, d: Path, readylog_name: str = "readylog.txt", validlog_name: str = "validlog.txt"
+    ) -> IsolatedSimLogData:
+        """Recieve the directory containing a binary and the simulation logs.
+        If no logs are found raises an error, otherwise return the postprocessed logs
+        read from JSON.
+        """
+        readylog = d / readylog_name
+        validlog = d / validlog_name
+        if not readylog.exists() or not validlog.exists():
+            raise FINNInternalError(f"Could not find simulation logs at {readylog} and {validlog}")
+        return {
+            "ready": json.loads(readylog.read_text()),
+            "valid": json.loads(validlog.read_text()),
+        }
+
+    def run(self) -> dict[str, IsolatedSimLogData]:
+        """Run a node isolated simulation and return the collected
+        input ready / output valid data, indexed based on node names."""
+        futures: list[Future] = []
+        datalock = Lock()
+        total = len(self.binaries)
+        done = 0
+
+        # Important to initialize from names. Otherwise the results are added into the dict
+        # in the order in which they finished simulating. But we want to keep the model order.
+        data: dict[str, self.IsolatedSimLogData] = {name: {} for name in self.names}
+
+        # TODO: Lock not needed; futures are not consumed just by
+        # TODO: using the callback, so we can unpack them later
+
+        # Callback to show progress and save the simulation result
+        def _done_callback_generator(name: str) -> Callable:
+            nonlocal total, done, data, datalock
+
+            def _f(future: Future) -> None:
+                nonlocal total, done, data, datalock
+                with datalock:
+                    done += 1
+                    log.info(
+                        f"[ [bold green]{int(100 * float(done)/float(total))}%"
+                        f"[/bold green] ] {name} done!",
+                        extra={"markup": True, "highlighter": None},
+                    )
+                    data[name] = future.result()
+
+            return _f
+
+        # Running the simulation threads
+        assert len(self.names) == len(self.binaries)
+        with self.console.status(f"Running simulation on every node. Log directory: {self.logdir}"):
+            start = time.time()
+            with ThreadPoolExecutor(len(self.binaries)) as tpe:
+                for i, binary in enumerate(self.binaries):
+                    futures.append(tpe.submit(self._run_binary, binary))
+                    futures[-1].add_done_callback(_done_callback_generator(self.names[i]))
+                tpe.shutdown(wait=True)
+            elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start))
+            log.info("Thread pool closed. Closing sockets and postprocessing data")
+            log.info(f"Simulations took {elapsed}")
+
+        # Finish the logs and clean up the sockets
+        for binary in self.binaries:
+            with self.get_logfile_path(binary).open("a") as logfile:
+                self.write_log(logfile, "Cleaning up socket.")
+        self._cleanup_sockets()
+
+        # Check for invalid data points
+        invalid = []
+        for i, name in enumerate(data.keys()):
+            if data[name] is None:
+                invalid.append((name, i))
+        if len(invalid) > 0:
+            raise FINNInternalError(
+                f"Lost connection / malformed response from nodes: "
+                f"{', '.join([str(x) for x in invalid])}"
+            )
+        return data
+
+    def _run_binary(self, binary: Path) -> IsolatedSimLogData | None:
+        """Thread routine: Run a single simulation from the given path and return
+        the collected results. Returns None if connection is lost."""
+        process_index = self.binaries.index(binary)
+        with self.get_logfile_path(binary).open("w+") as logfile:
+            # Logging helper
+            def write_log(msg: str) -> None:
+                self.write_log(logfile, msg)
+
+            # Initialize: Start simulation process and give the start command
+            write_log("Initializing simulation")
+            write_log(f"Binary is: {binary}")
+            proc_idx = self._start_process(binary, process_index)
+            response = self._send_and_receive(proc_idx, "start", {})
+            if response is None:
+                write_log(
+                    "No answer for the clients 'start' " "command received. Timeout or disconnect."
+                )
+                return None
+            write_log(f"Start response: {response}")
+
+            # Main loop
+            write_log("Beginning main loop")
+            logfile.flush()
+            total_status_requests = 0
+            while True:
+                # Request status in regular intervals
+                time.sleep(self.poll_interval)
+                write_log("Sending status request")
+                response = self._send_and_receive(proc_idx, "status", {})
+                total_status_requests += 1
+                write_log(f"Status request {total_status_requests} sent.")
+
+                # Process response
+                if response is None:
+                    write_log("Status request answered with None: Timeout or connection lost.")
+                    return None
+                state = response["state"]
+                write_log(f"Received answer for status request ({total_status_requests})")
+
+                # If the simulation is done, postprocess and return the collected data
+                if state == "done":
+                    write_log("Received done status. Sending stop signal to simulation.")
+                    resp = self._send_and_receive(proc_idx, "stop", {})
+                    if resp is None:
+                        write_log("No stop response received.")
+                    else:
+                        write_log("Stop successfully received.")
+                    return self.collect_results(binary.parent)
+
+                # Otherwise log the current status
+                # TODO: Field name - meaning wrong?
+                in_done = response["inputCyclesDone"]
+                in_target = response["inputCyclesTarget"]
+                out_done = response["outputCyclesDone"]
+                out_target = response["outputCyclesTarget"]
+                total_cycles = response["totalCycles"]
+                percent_simulated_input = int(100.0 * float(in_done) / float(in_target))
+                percent_simulated_output = int(100.0 * float(out_done) / float(out_target))
+                write_log("Status response:")
+                write_log(f"\tTotal cycles: {total_cycles}")
+                write_log(
+                    f"\tInput data simulated: {percent_simulated_input}% "
+                    f"({in_done} / {in_target})"
+                )
+                write_log(
+                    f"\tOutput data simulated: {percent_simulated_output}% "
+                    f"({out_done} / {out_target})"
+                )
+
+
+FIFODepthConfig: TypeAlias = dict[int, dict[str, str | list[int]]]
+IsoSimLogData = NodeIsolatedSimulationController.IsolatedSimLogData
+IsoSimLogDataByLayer = dict[str, IsoSimLogData]  # Indexed by layer name
+
+
+class IsolatedSimulation(Simulation):
+    def __init__(
+        self,
+        model: ModelWrapper,
+        simulation_type: SimulationType,
+        fpgapart: str,
+        clk_ns: float,
+        functional_sim: bool,
+        workers: int | None = None,
+    ) -> None:
+        super().__init__(model, simulation_type, fpgapart, clk_ns, functional_sim, workers)
+
+    def simulate(self) -> IsoSimLogDataByLayer:
+        """Simulate isolated nodes."""
+        if self.simulation_type != SimulationType.NODE_BASED_ISOLATED:
+            raise FINNInternalError(
+                f"Called simulation function 'simulate_node_isolated' "
+                f"does not match provided simulation type "
+                f"{self.simulation_type}"
+            )
+        names = [node.name for node in self.model.graph.node]
+        console = Console()
+        controller = NodeIsolatedSimulationController(
+            len(self.binaries), names, list(self.binaries.values()), console, 0.1, False
+        )
+        return controller.run()
+
+
+class RunLayerIsolatedSimulation(Transformation):
+    """Run a layer isolated simulation and calculate some information for a
+    later layer parallel simulation.
+
+    This modifies or creates a pandas DF and stores it in a csv file. This file can be
+    modified by the node connected simulation as well."""
+
+    def __init__(
+        self, fpgapart: str, clk_ns: float, functional_sim: bool, output_dir: Path
+    ) -> None:
+        """Run isolated layer simulations. The
+        default location is at cfg.output_dir/report/fifo_data.csv."""
+        super().__init__()
+        self.fpgapart = fpgapart
+        self.clk_ns = clk_ns
+        self.functional_sim = functional_sim
+        self.output_dir = output_dir
+
+        # Read / create dataframe with default path
+        self.default_fifo_data_path = self.output_dir / "report" / "fifo_data.csv"
+
+    def calculate_upper_bounds(self, data: IsoSimLogDataByLayer) -> dict[str, dict[str, int]]:
+        """Try to calculate an upper bound for the incoming FIFO size of the layers.
+        Return size indexed by layer name and stream name.
+
+        >>> step = RunLayerIsolatedSimulation("", 0.0, False)
+        >>> bounds = step.calculate_upper_bounds({
+        ... "A": {
+        ...         "ready": [
+        ...             {"totalCycles": 43, "inputCyclesDone": 12,
+        ...             "inputCyclesTarget": 24, "s_axi_0": 1, "s_axi_1": 0},
+        ...             {"totalCycles": 44, "inputCyclesDone": 13,
+        ...             "inputCyclesTarget": 24, "s_axi_0": 0, "s_axi_1": 0},
+        ...         ], "valid": []
+        ... },
+        ... "B": {
+        ...         "ready": [
+        ...             {"totalCycles": 100, "inputCyclesDone": 3,
+        ...             "inputCyclesTarget": 10, "s_axi_0": 1, "s_axi_1": 1,
+        ...             "s_axi_2": 0},
+        ...         ], "valid": []
+        ... },
+        ... "C": {
+        ...         "ready": [
+        ...             {"totalCycles": 43, "inputCyclesDone": 14,
+        ...             "inputCyclesTarget": 24, "s_axi_0": 1, "s_axi_1": 0},
+        ...             {"totalCycles": 44, "inputCyclesDone": 15,
+        ...             "inputCyclesTarget": 24, "s_axi_0": 0, "s_axi_1": 0},
+        ...         ], "valid": []
+        ... }
+        ... })
+        >>> bounds["A"]
+        {'s_axi_0': 1, 's_axi_1': 2}
+        >>> bounds["B"]
+        {'s_axi_0': 0, 's_axi_1': 0, 's_axi_2': 1}
+        >>> bounds["C"]
+        {'s_axi_0': 0, 's_axi_1': 0}
+        """
+
+        # TODO: Proper pytest tests
+        def _any_ready(cycle_data: dict[str, int]) -> bool:
+            for key in cycle_data.keys():
+                if (
+                    key not in ["totalCycles", "inputCyclesDone", "inputCyclesTarget"]
+                    and cycle_data[key] == 1
+                ):
+                    return True
+            return False
+
+        results: dict[str, dict[str, int]] = {}
+        for layer in data.keys():
+            # Save all keys that are not
+            results[layer] = {
+                stream_name: 0
+                for stream_name in data[layer]["ready"][0].keys()
+                if stream_name not in ["inputCyclesDone", "inputCyclesTarget", "totalCycles"]
+            }
+            for cycle_data_ready, cycle_data_valid in zip(
+                data[layer]["ready"], data[layer]["valid"], strict=True
+            ):
+                if cycle_data_ready["inputCyclesDone"] > int(
+                    cycle_data_ready["inputCyclesTarget"] / 2.0
+                ) and cycle_data_valid["outputCyclesDone"] > int(
+                    cycle_data_valid["outputCyclesTarget"] / 2.0
+                ):
+                    break
+                for stream_name in results[layer].keys():
+                    # TODO: Currently on the C++ side we multiply the
+                    # TODO: target cycles by 2, to get two samples
+                    # TODO: We keep track of ready signals until we see
+                    # TODO: the first ready after half of all cycles were seen.
+                    # TODO: This might change in the future
+                    if (
+                        cycle_data_ready["inputCyclesTarget"] % 2 != 0
+                        or cycle_data_valid["outputCyclesTarget"] % 2 != 0
+                    ):
+                        raise FINNInternalError(
+                            f"An 'inputCyclesTarget' / 'outputCyclesTarget' of layer {layer} seems "
+                            f"to not be an even number. Currently, we double "
+                            f"the target simulation cycles for every layer "
+                            f"on the C++ side. This error may point towards "
+                            f"a change on the C++ side, which may cause the "
+                            f"need to update this function accordingly!"
+                        )
+                    results[layer][stream_name] += int(cycle_data_ready[stream_name] == 0)
+
+        # TODO: This calculation assumes, that if the producer does NOT fire the entire time,
+        # TODO: the consumer can read at least at the same speed as
+        #       if the producer did, and not slower.
+        # TODO: (Since this would mean that less data pressure from
+        #       the producer makes the consumer _slower_.)
+        # TODO: This should usually be the case, but is important to keep in mind.
+        return results
+
+    def sanity_check_logged_data(self, data: IsoSimLogDataByLayer) -> None:
+        """Do checks on the returned data to make sure it is in spec.
+
+        A correctly formatted example would be:
+        >>> data = {
+        ...     "layer1": {
+        ...         "ready": [{"totalCycles": 10, "inputCyclesDone": 5,
+        ...                 "inputCyclesTarget": 10, "s_axi_0": 1}],
+        ...         "valid": [{"totalCycles": 10, "outputCyclesDone": 5,
+        ...                 "outputCyclesTarget": 10, "m_axi_0": 1}]
+        ...     }
+        ... }
+        >>> sim = RunLayerIsolatedSimulation("", 0.0, False)
+        >>> sim.sanity_check_logged_data(data)
+        >>>
+        """
+        # 0. Valid and ready are present
+        for layer, ldata in data.items():
+            if "valid" not in ldata.keys():
+                raise FINNInternalError(
+                    f"Simulation log data of layer {layer} is missing the VALID log."
+                )
+            if "ready" not in ldata.keys():
+                raise FINNInternalError(
+                    f"Simulation log data of layer {layer} is missing the READY log."
+                )
+        # 1. All cycle datas are uniform and have at least one stream signal
+        for i, (layer, ldata) in enumerate(data.items()):
+            cycle_data = ldata["ready"] + ldata["valid"]
+            lengths: set[int] = {len(cycle.keys()) for cycle in cycle_data}
+            if len(lengths) != 1:
+                raise FINNInternalError(
+                    f"Simulation log data inconsistent for layer "
+                    f"{layer} ({i}). Differing number of fields per cycle."
+                )
+            if next(iter(lengths)) < 4:
+                raise FINNInternalError(
+                    f"Simulation for layer {layer} must contain "
+                    f"atleast 4 fields (total cycles, AXI cycles "
+                    f"done, AXI cycles target and at least one AXI "
+                    f"ready/valid signal)!"
+                )
+        # 2. All ready logs contain the required keywords
+        readykeys = ["inputCyclesDone", "inputCyclesTarget", "totalCycles"]
+        for rlayer, rdata in data.items():
+            for cycle in rdata["ready"]:
+                if any(keyword not in cycle.keys() for keyword in readykeys):
+                    raise FINNInternalError(
+                        f"Simulation READY log of layer {rlayer} "
+                        f"contains cycles that are missing a required key."
+                    )
+                if any(key not in readykeys and "axi" not in key for key in cycle.keys()):
+                    raise FINNInternalError(
+                        f"In the READY simulation log of layer "
+                        f"{rlayer} there seem to be fields that "
+                        f"are not expected keywords or AXI streams!"
+                    )
+        # 3. All valid logs contain the required keywords
+        validkeys = ["outputCyclesDone", "outputCyclesTarget", "totalCycles"]
+        for vlayer, vdata in data.items():
+            for cycle in vdata["valid"]:
+                if any(keyword not in cycle.keys() for keyword in validkeys):
+                    raise FINNInternalError(
+                        f"Simulation VALID log of layer {vlayer} "
+                        f"contains cycles that are missing a required key."
+                    )
+                if any(key not in validkeys and "axi" not in key for key in cycle.keys()):
+                    raise FINNInternalError(
+                        f"In the VALID simulation log of layer "
+                        f"{vlayer} there seem to be fields that "
+                        f"are not expected keywords or AXI streams!"
+                    )
+        # 4. Cycles done can never be larger then the number of total cycles passed in the sim
+        for layer, cdata in data.items():
+            for line in cdata["ready"] + cdata["valid"]:
+                if (
+                    "inputCyclesDone" in line.keys()
+                    and line["inputCyclesDone"] > line["totalCycles"]
+                ):
+                    raise FINNInternalError(
+                        f"Simulation log of layer {layer} looks incorrect: "
+                        f"Number of active receiving cycles "
+                        f"({line['inputCyclesDone']}) larger than number of "
+                        f"total cycles passed ({line['totalCycles']})."
+                    )
+                if (
+                    "outputCyclesDone" in line.keys()
+                    and line["outputCyclesDone"] > line["totalCycles"]
+                ):
+                    raise FINNInternalError(
+                        f"Simulation log of layer {layer} looks incorrect: "
+                        f"Number of active producing cycles "
+                        f"({line['outputCyclesDone']}) larger than number of "
+                        f"total cycles passed ({line['totalCycles']})."
+                    )
+        # 5. Stream keywords can never have any other value than 1 (HIGH) or 0 (LOW)
+        reserved_keywords = readykeys + validkeys
+        for layer, ldata in data.items():
+            for cycle_data in ldata["ready"] + ldata["valid"]:
+                for key in cycle_data.keys():
+                    if key not in reserved_keywords and cycle_data[key] not in [0, 1]:
+                        raise FINNInternalError(
+                            f"Layer {layer} has data point where a "
+                            f"non-reserved field (thus an axi stream "
+                            f"ready/valid signal) is neither 0 nor 1: "
+                            f"Key: {key}, Value: {cycle_data[key]}"
+                        )
+        # 6. Data is not empty
+        for layer, ldata in data.items():
+            if len(ldata["ready"]) == 0:
+                raise FINNInternalError(f"Layer {layer} has no ready data!")
+            if len(ldata["valid"]) == 0:
+                raise FINNInternalError(f"Layer {layer} has no valid data!")
+        # 7. Check that the order of axi streams corresponds to their names. This helps
+        # somewhat to guarantee that the order always stayed the same from building the simulations
+        # to evaluating their data
+
+        # The number in the name should increase with every stream, from 0, without gaps
+        # and streams should be called "s_axis_<number>"
+        readykeys = ["inputCyclesDone", "inputCyclesTarget", "totalCycles"]
+        for layer, ldata in data.items():
+            for cycledict in ldata["ready"]:
+                current_stream_idx = 0
+                for key in cycledict.keys():
+                    if key not in readykeys:
+                        m = re.fullmatch(r"^s_axis_(\d+)$", key)
+                        if m is None:
+                            raise FINNInternalError(
+                                f"Layer {layer} has a non-expected key that "
+                                f"does not match the names of streams expected "
+                                f"(s_axis_<number>).\n\tKey is: {key}"
+                            )
+                        stream_idx = m.group(1)
+                        if int(stream_idx) != current_stream_idx:
+                            raise FINNInternalError(
+                                f"Layer {layer} has non-expected stream key "
+                                f"that does not follow the expected index "
+                                f"scheme: Current expected index is "
+                                f"{current_stream_idx}. Got instead: "
+                                f"{stream_idx}"
+                            )
+                        current_stream_idx += 1
+        # TODO: Check that names match vivado_stitch_ifnames.
+        # TODO: Currently there is no easy way to do this, since we never save the isolated
+        # TODO: node-models and vivado_stitch_ifnames is a metadata prop of that isolated model
+
+    def percent_ready(self, data: IsoSimLogDataByLayer) -> dict[str, float]:
+        """Calculate how many percent of the time the layer was ready for input data.
+        Return indexed by layer name."""
+        # TODO: Implement
+        return dict.fromkeys(data, 0)
+
+    def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, bool]:
+        """Run isolated layer simulations."""
+        # Run the simulation
+        sim = IsolatedSimulation(
+            model,
+            SimulationType.NODE_BASED_ISOLATED,
+            self.fpgapart,
+            self.clk_ns,
+            self.functional_sim,
+        )
+        data: IsoSimLogDataByLayer = sim.simulate()
+
+        # Check if data looks good
+        log.info("Checking validity of received simulation data...")
+        start = time.time()
+        self.sanity_check_logged_data(data)
+        log.info(f"Validity check took {time.time() - start} seconds.")
+
+        # Calculate upper bounds
+        log.info("Estimating upper bounds...")
+        start = time.time()
+        in_fifo_upper_bound = self.calculate_upper_bounds(data)
+        log.info(f"Estimation took {time.time() - start} seconds.")
+
+        # Write into report file
+        upper_bounds_file = self.output_dir / "report" / "estimate_upper_fifo_bound.json"
+        upper_bounds_file.write_text(json.dumps(in_fifo_upper_bound, indent=4))
+        log.info(f"Wrote results to: {upper_bounds_file}")
+
+        # Save data into dataframe
+        # NOTE: We actually have to swap the order here: We recorded the _incoming_ FIFO sizes
+        # However the connected simulation stores the depths on the layers before it, so
+        # essentially _outgoing_ FIFO sizes.
+
+        # NOTE: For this mapping to work, ordering has to be kept correctly in each step:
+        # 1. Mapping node.inputs to vivado_stitch_ifnames metadata prop (CreateStitchedIP)
+        # 2. Mapping IO shapes to ifnames from before (simulation_builder.py)
+        # 3. Mapping stream_descrs to M/S_AXIS_CONTROL array (C++ simulation creation)
+        # 4. Writing the data to json. Order of S_AXIS_CONTROL -> order in which JSON gets written
+        #       IMPORTANT: Use nlohmann::ordered_json to keep the insertion order!
+        # 5. Reading the JSON into python (python dicts are ordered since 3.7)
+        #       According to docs, the Python JSON module also keeps order
+        # 6. Syncing node.inputs to order of s_axi_... streams read from the JSON.
+        edited_bounds = {}
+
+        # Fill edited_bounds with empty values
+        for node in model.graph.node:
+            suc = model.find_direct_successors(node)
+            if suc is None:
+                edited_bounds[node.name] = [-1]
+            else:
+                edited_bounds[node.name] = [-1] * len(suc)
+
+        # For every node check its predecessors.
+        # Find the index/tensor that connects the predecessor and the current one
+        # Use that index to retrieve the fifo depth between them and save it
+        def get_index(a: Any, values: Any) -> int | None:
+            for i, val in enumerate(values):
+                if val == a:
+                    return i
+            return None
+
+        for node in model.graph.node:
+            # Rely on the fact that find_direct_predecessors gives the streams in-order
+            predecessors = model.find_direct_predecessors(node)
+            if predecessors is None:
+                continue
+            for predecessor in predecessors:
+                # Find out which m_axis stream of the predecessor leads to node
+                for producer_idx, pre_out in enumerate(predecessor.output):
+                    if pre_out in node.input:
+                        consumer_idx = get_index(pre_out, node.input)
+                        if consumer_idx is None:
+                            raise FINNInternalError(
+                                f"Could not find index of "
+                                f"{predecessor.name}'s output and "
+                                f"{node.name}'s input: {pre_out}. "
+                                f"Index in predecessor.output is "
+                                f"{producer_idx}"
+                            )
+                        # TODO: Switch to array instead of dict?
+                        # We have to conver the string-key (s_axi_...) into the index of the dict
+                        key = list(in_fifo_upper_bound[node.name].keys())[consumer_idx]
+                        # TODO: Tests
+                        edited_bounds[predecessor.name][producer_idx] = in_fifo_upper_bound[
+                            node.name
+                        ][
+                            key
+                        ]  # noqa
+                        log.info(
+                            f"Incoming FIFO {node.name}[{key}/{consumer_idx}] "
+                            f"-> outgoing FIFO {predecessor.name}[{producer_idx}]"
+                        )
+
+        # Prepare the data
+        df_data = {
+            "onnx_index": [],
+            "node": [],
+            "stream": [],
+            "out_fifo_upper_bound": [],
+            "input_ready_percent": [],
+        }
+        for layer, layerdata in edited_bounds.items():
+            for idx in range(len(layerdata)):
+                df_data["onnx_index"].append([n.name for n in model.graph.node].index(layer))
+                df_data["node"].append(layer)
+                df_data["stream"].append(idx)
+                df_data["out_fifo_upper_bound"].append(layerdata[idx])
+                # TODO: Remove input_ready_percent?
+                # df_data["input_ready_percent"].append(self.percent_ready(data)[layer])
+                df_data["input_ready_percent"].append(0.0)
+
+        # Create the DF
+        self.fifo_data = pd.DataFrame(df_data)
+        log.info("First few entries of collected data:")
+        log.info(str(self.fifo_data))
+
+        # Save in dataframe and model
+        model = store_fifo_data(
+            model,
+            self.fifo_data,
+            self.default_fifo_data_path,
+            delete_existing=True,
+            store_html=True,
+        )
+
+        # TODO: Integrate data into the layer parallel simulation
+        return model, False
diff --git a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py
index 966bff3d65..e7c88a7766 100644
--- a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py
+++ b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py
@@ -39,8 +39,7 @@
 
 
 def _get_signed_from_upstream(model, trunc_node):
-    """
-    Find out what the sign of the input to the trunc node is,
+    """Find out what the sign of the input to the trunc node is,
     by looking at the upstream nodes.
     """
     node = trunc_node
@@ -112,10 +111,36 @@ def _get_signed_from_upstream(model, trunc_node):
 
 
 class AvgPoolAndTruncToQuantAvgPool(Transformation):
-    """
-    Convert a section of nodes of the pattern:
+    """Convert a section of nodes of the pattern:
     AveragePool -> Mul (scalar) -> Trunc
-    To the FINN op: QuantAvgPool2d
+    To the FINN op: QuantAvgPool2d.
+    """
+
+    def apply(self, model):
+        opset_imports = model.get_opset_imports()
+        if "qonnx.custom_op.general" in opset_imports:
+            trunc_opset = opset_imports["qonnx.custom_op.general"]
+        elif "onnx.brevitas" in opset_imports:
+            trunc_opset = opset_imports["onnx.brevitas"]
+        else:
+            trunc_opset = 1  # Default to v1 if no opset found
+        if trunc_opset == 1:
+            model = model.transform(AvgPoolAndTruncv1ToQuantAvgPool())
+            return model, False
+        elif trunc_opset == 2:
+            model = model.transform(AvgPoolAndTruncv2ToQuantAvgPool())
+            return model, False
+        else:
+            raise NotImplementedError(
+                f"AvgPoolAndTruncToQuantAvgPool not implemented for "
+                f"Trunc opset version {trunc_opset}."
+            )
+
+
+class AvgPoolAndTruncv1ToQuantAvgPool(Transformation):
+    """Convert a section of nodes of the pattern:
+    AveragePool -> Mul (scalar) -> Trunc (v1)
+    To the FINN op: Div -> QuantAvgPool2d -> Mul.
     """
 
     def apply(self, model):
@@ -164,7 +189,7 @@ def apply(self, model):
                         k_s = get_by_name(n.attribute, "kernel_shape")
                         if k_s is None or len(k_s.ints) != 2 or len(set(k_s.ints)) != 1:
                             raise ValueError(
-                                "FINN only supports average pooling with " "2D square kernels."
+                                "FINN only supports average pooling with 2D square kernels."
                             )
                         k_s = k_s.ints[0]
 
@@ -197,7 +222,7 @@ def apply(self, model):
                         normalized_mode_string = rounding_mode.s.upper()
                         if rounding_mode is None or normalized_mode_string != b"FLOOR":
                             raise ValueError(
-                                "The Trunc node must have the rounding_mode " "set to 'FLOOR'."
+                                "The Trunc node must have the rounding_mode set to 'FLOOR'."
                             )
                         for inp in t_node.input[1:]:
                             if model.get_initializer(inp) is None:
@@ -314,10 +339,9 @@ def apply(self, model):
 
 
 class AvgPoolAndTruncv2ToQuantAvgPool(Transformation):
-    """
-    Convert a section of nodes of the pattern:
+    """Convert a section of nodes of the pattern:
     AveragePool -> Trunc (v2)
-    To the FINN op: Div -> QuantAvgPool2d -> Mul
+    To the FINN op: Div -> QuantAvgPool2d -> Mul.
     """
 
     def apply(self, model):
@@ -335,7 +359,7 @@ def apply(self, model):
                     k_s = get_by_name(node.attribute, "kernel_shape")
                     if k_s is None or len(k_s.ints) != 2 or len(set(k_s.ints)) != 1:
                         raise ValueError(
-                            "FINN only supports average pooling with " "2D square kernels."
+                            "FINN only supports average pooling with 2D square kernels."
                         )
                     k_s = k_s.ints[0]
 
@@ -346,7 +370,7 @@ def apply(self, model):
                     stride = get_by_name(node.attribute, "strides")
                     if stride is None or len(stride.ints) != 2 or len(set(stride.ints)) != 1:
                         raise ValueError(
-                            "FINN only supports 2D strides with equal values in " "each direction."
+                            "FINN only supports 2D strides with equal values in each direction."
                         )
                     stride = stride.ints[0]
 
@@ -355,7 +379,7 @@ def apply(self, model):
                     normalized_mode_string = rounding_mode.s.upper()
                     if rounding_mode is None or normalized_mode_string != b"FLOOR":
                         raise ValueError(
-                            "The Trunc node must have the rounding_mode " "set to 'FLOOR'."
+                            "The Trunc node must have the rounding_mode set to 'FLOOR'."
                         )
                     for inp in t_node.input[1:]:
                         if model.get_initializer(inp) is None:
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index f034f49bc5..503554fff6 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -164,7 +164,7 @@ def get_liveness_threshold_cycles():
     return int(os.getenv("LIVENESS_THRESHOLD", 1000000))
 
 
-def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str | Path:
+def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str:
     """Creates a folder with given prefix to be used as a build dir.
     Use this function instead of tempfile.mkdtemp to ensure any generated files
     will survive on the host after the FINN Docker container exits."""
@@ -185,7 +185,7 @@ def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str | Path
     return str(tmpdir)
 
 
-def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True):
+def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True, print_stderr=True):
     """Helper function to launch a process in a way that facilitates logging
     stdout/stderr with Python loggers.
     Returns (cmd_out, cmd_err) if successful, raises CalledProcessError otherwise."""
@@ -204,7 +204,7 @@ def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True):
     # Handle stderr, depending on return code
     if process.returncode == 0:
         # Process completed successfully, log stderr only as WARNING
-        if cmd_err:
+        if cmd_err and print_stderr:
             log.warning(cmd_err)
     else:
         # Process failed, log stderr as ERROR
diff --git a/src/finn/util/deprecated.py b/src/finn/util/deprecated.py
index 6985653ac0..8593aa8388 100644
--- a/src/finn/util/deprecated.py
+++ b/src/finn/util/deprecated.py
@@ -1,13 +1,12 @@
 """Implements a decorator to mark functions as deprecated."""
-
 import functools
+import warnings
 from collections.abc import Callable
 from typing import ParamSpec, TypeVar
-
 from finn.util.logging import log
 
 rT = TypeVar("rT")  # return type  # noqa: N816
-pT = ParamSpec("pT")  # parameters type # noqa: N816
+pT = ParamSpec("pT")  # parameters type  # noqa: N816
 
 
 def deprecated(func: Callable[pT, rT]) -> Callable[pT, rT]:
diff --git a/src/finn/util/logging.py b/src/finn/util/logging.py
index 3ae86f4dfd..50c73ef0ea 100644
--- a/src/finn/util/logging.py
+++ b/src/finn/util/logging.py
@@ -1,13 +1,12 @@
-"""Logging utilities for FINN using Rich console.
-
-This module provides logging configuration and utilities for FINN,
-including a Rich console for formatted output.
-"""
-
+"""Handle logging related functionality."""
 import logging
 from rich.console import Console
+from rich.progress import Progress, TaskID
+from threading import Lock
 from types import TracebackType
 
+log = logging.getLogger("finn_logger")
+
 # Top level console used by logger
 # Can be retrieved to create for example status displays in Rich
 _RICH_CONSOLE = Console()
@@ -36,8 +35,6 @@ def set_console(console: Console) -> None:
     _RICH_CONSOLE = console
 
 
-log = logging.getLogger("finn_logger")
-
 
 class LogDisabledConsole:
     """Use to get a console to use for Rich formatting without logging enabled."""
@@ -74,3 +71,65 @@ def __exit__(
             Exception traceback.
         """
         log.disabled = False
+
+
+class ThreadsafeProgressDisplay:
+    """Small helper to display multithreaded display bars.
+    Logging has to be disabled before usage.
+    """
+
+    def __init__(
+        self, tasks: list[str], totals: list[int | float], descriptions: list[str]
+    ) -> None:
+        """Create a new progress display."""
+        self.lock = Lock()
+        self.state: dict[str, int | float] = dict.fromkeys(tasks, 0)
+        self.ptasks: dict[str, TaskID] = {}
+        self.totals_state = dict(zip(tasks, totals, strict=True))
+
+        self.tasks: list[str] = tasks
+        self.totals: list[float | int] = totals
+        self.descriptions: list[str] = descriptions
+        assert len(tasks) == len(totals)
+        assert len(totals) == len(descriptions)
+
+    def start(self) -> None:
+        """Start the display."""
+        self.progress = Progress(transient=True, redirect_stdout=False, redirect_stderr=False)
+        self.progress.start()
+        for task, desc, total in zip(self.tasks, self.descriptions, self.totals, strict=True):
+            self.ptasks[task] = self.progress.add_task(desc, total=total)
+
+    def update(self, task: str, value: float | None = None, total: float | None = None) -> None:
+        """Update a value and the progress bar. If the task does not exist do nothing.
+        This is practical, because it means any method can update the progressbar
+        without any danger. Just the initially calling method must create a fitting display object.
+
+        If value is None, the value is incremented once.
+        """
+        if task in self.state and task in self.ptasks:
+            # NOTE: rich.progress at some point apparently became threadsafe,
+            # but just to be extra sure we add a lock here.
+            with self.lock:
+                if value is None:
+                    self.state[task] += 1
+                else:
+                    self.state[task] = value
+                if total is not None:
+                    self.totals_state[task] = total
+                self.progress.update(
+                    self.ptasks[task],
+                    completed=self.state[task],
+                    refresh=True,
+                    total=self.totals_state[task],
+                )
+
+    def stop(self) -> None:
+        """Stop the display."""
+        self.progress.stop()
+
+    def __enter__(self) -> None:
+        self.start()
+
+    def __exit__(self, tp, vl, tb) -> None:
+        self.stop()
diff --git a/src/finn/xsi/setup.py b/src/finn/xsi/setup.py
index 609eabd88c..3aa6c91523 100644
--- a/src/finn/xsi/setup.py
+++ b/src/finn/xsi/setup.py
@@ -72,7 +72,7 @@ def get_build_paths() -> Tuple[List[str], str, List[str]]:
         compiler = "clang++"
 
     # Compile flags
-    extra_compile_args = ["--std=c++17", "-Wall", "-O3", "-shared", "-fPIC"]
+    extra_compile_args = ["--std=c++20", "-Wall", "-O3", "-shared", "-fPIC"]
 
     return include_dirs, compiler, extra_compile_args
 
@@ -147,7 +147,7 @@ def build_xsi(force: bool = False, verbose: bool = True) -> bool:
     include_dirs, compiler, compile_args = get_build_paths()
 
     # Source files
-    source_files = ["xsi_bind.cpp", "xsi_finn.cpp"]
+    source_files = ["xsi_bind.cpp", "src/Port.cpp", "src/Design.cpp", "src/Kernel.cpp", "src/SharedLibrary.cpp"]
 
     # Build command
     cmd = [compiler] + compile_args
@@ -155,6 +155,7 @@ def build_xsi(force: bool = False, verbose: bool = True) -> bool:
     # Add include directories
     for inc_dir in include_dirs:
         cmd.extend(["-I", inc_dir])
+    cmd.extend(["-I", "./include"])
 
     # Output file
     cmd.extend(["-o", "xsi.so"])
diff --git a/tests/fpgadataflow/test_bram_block_search.py b/tests/fpgadataflow/test_bram_block_search.py
new file mode 100644
index 0000000000..2542fde769
--- /dev/null
+++ b/tests/fpgadataflow/test_bram_block_search.py
@@ -0,0 +1,465 @@
+"""Test BRAM block calculations and search algorithms."""
+# ruff: noqa: ANN201, SLF001
+
+import pytest
+
+import math
+
+from finn.transformation.fpgadataflow.simulation import (
+    calculate_bram_blocks,
+    calculate_bram_depth_range,
+)
+
+
+class TestBRAMBlockCalculations:
+    """Test BRAM block calculation functions."""
+
+    def test_calculate_bram_blocks_bitwidth_1(self) -> None:
+        """Test BRAM block calculation for 1-bit data."""
+        assert calculate_bram_blocks(1, 1) == 1
+        assert calculate_bram_blocks(16384, 1) == 1
+        assert calculate_bram_blocks(16385, 1) == 2
+        assert calculate_bram_blocks(32768, 1) == 2
+
+    def test_calculate_bram_blocks_bitwidth_2(self) -> None:
+        """Test BRAM block calculation for 2-bit data."""
+        assert calculate_bram_blocks(1, 2) == 1
+        assert calculate_bram_blocks(8192, 2) == 1
+        assert calculate_bram_blocks(8193, 2) == 2
+        assert calculate_bram_blocks(16384, 2) == 2
+
+    def test_calculate_bram_blocks_bitwidth_4(self) -> None:
+        """Test BRAM block calculation for 4-bit data."""
+        assert calculate_bram_blocks(1, 4) == 1
+        assert calculate_bram_blocks(4096, 4) == 1
+        assert calculate_bram_blocks(4097, 4) == 2
+        assert calculate_bram_blocks(8192, 4) == 2
+
+    def test_calculate_bram_blocks_bitwidth_9(self) -> None:
+        """Test BRAM block calculation for 9-bit data."""
+        assert calculate_bram_blocks(1, 9) == 1
+        assert calculate_bram_blocks(2048, 9) == 1
+        assert calculate_bram_blocks(2049, 9) == 2
+
+    def test_calculate_bram_blocks_bitwidth_18(self) -> None:
+        """Test BRAM block calculation for 18-bit data."""
+        assert calculate_bram_blocks(1, 18) == 1
+        assert calculate_bram_blocks(1024, 18) == 1
+        assert calculate_bram_blocks(1025, 18) == 2
+
+    def test_calculate_bram_blocks_wide_bitwidth_deep(self) -> None:
+        """Test BRAM block calculation for wide bitwidth with depth > 512."""
+        # bitwidth = 40, depth = 1024 > 512
+        # Uses formula: ⌈1024/1024⌉ * ⌈40/18⌉ = 1 * 3 = 3
+        assert calculate_bram_blocks(1024, 40) == 3
+
+    def test_calculate_bram_blocks_wide_bitwidth_shallow(self) -> None:
+        """Test BRAM block calculation for wide bitwidth with depth <= 512."""
+        # bitwidth = 40, depth = 512 <= 512
+        # Uses formula: ⌈512/512⌉ * ⌈40/36⌉ = 1 * 2 = 2
+        assert calculate_bram_blocks(512, 40) == 2
+
+
+class TestBRAMDepthRange:
+    """Test BRAM depth range inversion function."""
+
+    def test_depth_range_bitwidth_1(self) -> None:
+        """Test depth range calculation for 1-bit data."""
+        min_d, max_d = calculate_bram_depth_range(1, 1)
+        assert min_d == 1
+        assert max_d == 16384
+        assert calculate_bram_blocks(min_d, 1) == 1
+        assert calculate_bram_blocks(max_d, 1) == 1
+
+        min_d, max_d = calculate_bram_depth_range(2, 1)
+        assert min_d == 16385
+        assert max_d == 32768
+        assert calculate_bram_blocks(min_d, 1) == 2
+        assert calculate_bram_blocks(max_d, 1) == 2
+
+    def test_depth_range_bitwidth_4(self) -> None:
+        """Test depth range calculation for 4-bit data."""
+        min_d, max_d = calculate_bram_depth_range(1, 4)
+        assert min_d == 1
+        assert max_d == 4096
+        assert calculate_bram_blocks(min_d, 4) == 1
+        assert calculate_bram_blocks(max_d, 4) == 1
+
+    def test_depth_range_bitwidth_5_valid_blocks(self) -> None:
+        """Test block count validation for bitwidth=5."""
+        # bitwidth=5 uses ⌈5/9⌉=1 bitwidth factor (falls in <=9 range)
+        # So all blocks should be valid
+        min_d, max_d = calculate_bram_depth_range(1, 5)
+        assert max_d > 0, "1 block should be valid for bitwidth=5"
+        assert calculate_bram_blocks(min_d, 5) == 1
+        assert calculate_bram_blocks(max_d, 5) == 1
+
+        min_d, max_d = calculate_bram_depth_range(2, 5)
+        assert max_d > 0, "2 blocks should be valid for bitwidth=5"
+        assert calculate_bram_blocks(min_d, 5) == 2
+        assert calculate_bram_blocks(max_d, 5) == 2
+
+    def test_depth_range_bitwidth_10_valid_blocks(self) -> None:
+        """Test block count validation for bitwidth=10."""
+        # bitwidth=10 uses ⌈10/18⌉=1 bitwidth factor (falls in <=18 range)
+        min_d, max_d = calculate_bram_depth_range(1, 10)
+        assert max_d > 0
+        assert calculate_bram_blocks(min_d, 10) == 1
+
+        min_d, max_d = calculate_bram_depth_range(2, 10)
+        assert max_d > 0
+        assert calculate_bram_blocks(min_d, 10) == 2
+
+    def test_depth_range_wide_bitwidth(self) -> None:
+        """Test depth range for wide bitwidths > 18."""
+        # bitwidth=40 has two modes depending on depth
+        min_d, max_d = calculate_bram_depth_range(2, 40)
+        # Should use depth ≤ 512 mode: ⌈depth/512⌉ * ⌈40/36⌉
+        # 2 blocks / 2 = 1 depth_blocks → (1, 512)
+        if max_d > 0:
+            assert max_d <= 512
+            assert calculate_bram_blocks(min_d, 40) == 2
+
+    def test_depth_range_consistency_all_bitwidths(self) -> None:
+        """Test that all valid ranges actually produce the correct block count."""
+        for bitwidth in range(1, 8192):
+            for blocks in range(1, 1024):
+                min_d, max_d = calculate_bram_depth_range(blocks, bitwidth)
+                if max_d > 0:  # Valid configuration
+                    # Verify both endpoints produce correct block count
+                    assert calculate_bram_blocks(min_d, bitwidth) == blocks, (
+                        f"Min depth {min_d} for {blocks} blocks, "
+                        f"bitwidth {bitwidth} produces wrong count"
+                    )
+                    assert calculate_bram_blocks(max_d, bitwidth) == blocks, (
+                        f"Max depth {max_d} for {blocks} blocks, "
+                        f"bitwidth {bitwidth} produces wrong count"
+                    )
+
+                    # Verify just outside the range produces different counts
+                    if min_d > 1:
+                        assert calculate_bram_blocks(min_d - 1, bitwidth) < blocks
+                    assert calculate_bram_blocks(max_d + 1, bitwidth) > blocks
+
+
+class TestGetValidBlockCounts:
+    """Test the _get_valid_block_counts helper method."""
+
+    def test_all_valid_bitwidth_1(self) -> None:
+        """Test that all block counts are valid for bitwidth=1."""
+        from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation
+
+        # Create dummy instance just to test the method
+        sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation)
+
+        valid_blocks = sim._get_valid_block_counts(1, 10, 1)
+        assert valid_blocks == list(range(1, 11))
+
+    def test_wide_bitwidth_filtering(self) -> None:
+        """Test that some block counts may be invalid for wide bitwidths."""
+        from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation
+
+        sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation)
+
+        # For bitwidth > 18, some block counts may be invalid
+        valid_blocks = sim._get_valid_block_counts(1, 20, 40)
+        # Verify all returned blocks produce valid ranges
+        for b in valid_blocks:
+            _, max_d = calculate_bram_depth_range(b, 40)
+            assert max_d > 0, f"Block {b} should produce valid range"
+
+    def test_range_respects_bounds(self) -> None:
+        """Test that valid blocks respect min/max bounds."""
+        from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation
+
+        sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation)
+
+        valid_blocks = sim._get_valid_block_counts(5, 15, 1)
+        assert min(valid_blocks) >= 5
+        assert max(valid_blocks) <= 15
+        assert len(valid_blocks) == 11
+
+    def test_empty_when_no_valid_in_range(self) -> None:
+        """Test that empty list is returned when no valid configs exist in range."""
+        from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation
+
+        sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation)
+
+        # Test a scenario where the range might have no valid blocks
+        # (this is rare but the method should handle it)
+        valid_blocks = sim._get_valid_block_counts(100, 99, 5)  # Invalid range
+        assert valid_blocks == []
+
+
+class TestExponentialBinarySearchLogic:
+    """Test the exponential + binary search algorithm logic (without actual simulation)."""
+
+    def test_exponential_indices_progression(self) -> None:
+        """Test that exponential search correctly progresses through indices."""
+        # Simulate the exponential index progression
+        valid_blocks = list(range(1, 101))  # 100 valid blocks
+
+        # Exponential progression should be: 0, 1, 2, 4, 8, 16, 32, 64...
+        exp_idx = 0
+        indices_checked = []
+
+        while exp_idx < len(valid_blocks) - 1:
+            indices_checked.append(exp_idx)
+            exp_idx = min(exp_idx * 2 if exp_idx > 0 else 1, len(valid_blocks) - 1)
+
+        assert indices_checked == [0, 1, 2, 4, 8, 16, 32, 64]
+
+    def test_binary_search_reduces_range(self) -> None:
+        """Test that binary search correctly narrows the range."""
+        lower_idx = 0
+        upper_idx = 99
+
+        iterations = 0
+        while lower_idx < upper_idx:
+            mid_idx = (lower_idx + upper_idx) // 2
+            # Simulate "success" for indices < 50
+            if mid_idx < 50:
+                upper_idx = mid_idx
+            else:
+                lower_idx = mid_idx + 1
+            iterations += 1
+
+            # Prevent infinite loop in test
+            if iterations > 20:
+                break
+
+        assert lower_idx == upper_idx
+        assert iterations <= 7  # log2(100) ≈ 6.6
+
+
+class TestSRL16ELUTCalculations:
+    """Test SRL16E LUT calculation functions."""
+
+    def test_calculate_srl16e_luts_basic(self):
+        """Test basic SRL16E LUT calculations."""
+        from finn.transformation.fpgadataflow.simulation import calculate_srl16e_luts
+
+        # Formula: LUTs = ⌈depth/32⌉ * ⌈bitwidth/2⌉
+        # depth=32, bitwidth=2: ⌈32/32⌉ * ⌈2/2⌉ = 1 * 1 = 1
+        assert calculate_srl16e_luts(32, 2) == 1
+
+        # depth=64, bitwidth=2: ⌈64/32⌉ * ⌈2/2⌉ = 2 * 1 = 2
+        assert calculate_srl16e_luts(64, 2) == 2
+
+        # depth=32, bitwidth=4: ⌈32/32⌉ * ⌈4/2⌉ = 1 * 2 = 2
+        assert calculate_srl16e_luts(32, 4) == 2
+
+        # depth=33, bitwidth=2: ⌈33/32⌉ * ⌈2/2⌉ = 2 * 1 = 2
+        assert calculate_srl16e_luts(33, 2) == 2
+
+    def test_calculate_srl16e_luts_various_bitwidths(self):
+        """Test SRL16E LUT calculations for various bitwidths."""
+        from finn.transformation.fpgadataflow.simulation import calculate_srl16e_luts
+
+        # Bitwidth 1: ⌈1/2⌉ = 1
+        assert calculate_srl16e_luts(32, 1) == 1
+        assert calculate_srl16e_luts(64, 1) == 2
+
+        # Bitwidth 3: ⌈3/2⌉ = 2
+        assert calculate_srl16e_luts(32, 3) == 2
+        assert calculate_srl16e_luts(64, 3) == 4
+
+        # Bitwidth 8: ⌈8/2⌉ = 4
+        assert calculate_srl16e_luts(32, 8) == 4
+        assert calculate_srl16e_luts(64, 8) == 8
+
+    def test_calculate_srl16e_luts_small_depths(self):
+        """Test SRL16E LUT calculations for small depths."""
+        from finn.transformation.fpgadataflow.simulation import calculate_srl16e_luts
+
+        # Small depths still use at least 1 LUT per bitwidth factor
+        assert calculate_srl16e_luts(2, 2) == 1
+        assert calculate_srl16e_luts(16, 2) == 1
+        assert calculate_srl16e_luts(31, 2) == 1
+
+
+class TestSRL16EDepthRange:
+    """Test SRL16E depth range inversion function."""
+
+    def test_depth_range_basic(self):
+        """Test basic depth range calculation for SRL16E."""
+        from finn.transformation.fpgadataflow.simulation import (
+            calculate_srl16e_depth_range,
+            calculate_srl16e_luts,
+        )
+
+        # 1 LUT, bitwidth=2
+        min_d, max_d = calculate_srl16e_depth_range(1, 2)
+        assert min_d == 2
+        assert max_d == 32
+        assert calculate_srl16e_luts(min_d, 2) == 1
+        assert calculate_srl16e_luts(max_d, 2) == 1
+
+    def test_depth_range_bitwidth_1(self):
+        """Test depth range for 1-bit data."""
+        from finn.transformation.fpgadataflow.simulation import (
+            calculate_srl16e_depth_range,
+            calculate_srl16e_luts,
+        )
+
+        min_d, max_d = calculate_srl16e_depth_range(1, 1)
+        assert min_d == 2
+        assert max_d == 32
+        assert calculate_srl16e_luts(min_d, 1) == 1
+        assert calculate_srl16e_luts(max_d, 1) == 1
+
+        min_d, max_d = calculate_srl16e_depth_range(2, 1)
+        assert min_d == 33
+        assert max_d == 64
+        assert calculate_srl16e_luts(min_d, 1) == 2
+        assert calculate_srl16e_luts(max_d, 1) == 2
+
+    def test_depth_range_invalid_odd_luts(self):
+        """Test that odd LUT counts are invalid for certain bitwidths."""
+        from finn.transformation.fpgadataflow.simulation import calculate_srl16e_depth_range
+
+        # Bitwidth=4: ⌈4/2⌉ = 2, so only even LUT counts are valid
+        _, max_d = calculate_srl16e_depth_range(1, 4)
+        assert max_d == 0, "1 LUT should be invalid for bitwidth=4"
+
+        _, max_d = calculate_srl16e_depth_range(2, 4)
+        assert max_d > 0, "2 LUTs should be valid for bitwidth=4"
+
+    def test_depth_range_consistency(self):
+        """Test that all valid ranges produce the correct LUT count."""
+        from finn.transformation.fpgadataflow.simulation import (
+            calculate_srl16e_depth_range,
+            calculate_srl16e_luts,
+        )
+
+        for bitwidth in [1, 2, 3, 4, 8, 16]:
+            for luts in range(1, 20):
+                min_d, max_d = calculate_srl16e_depth_range(luts, bitwidth)
+                if max_d > 0:  # Valid configuration
+                    # Verify both endpoints produce correct LUT count
+                    assert calculate_srl16e_luts(min_d, bitwidth) == luts, (
+                        f"Min depth {min_d} for {luts} LUTs, "
+                        f"bitwidth {bitwidth} produces wrong count"
+                    )
+                    assert calculate_srl16e_luts(max_d, bitwidth) == luts, (
+                        f"Max depth {max_d} for {luts} LUTs, "
+                        f"bitwidth {bitwidth} produces wrong count"
+                    )
+
+                    # Verify just outside the range produces different counts
+                    if min_d > 2:
+                        assert calculate_srl16e_luts(min_d - 1, bitwidth) < luts
+                    assert calculate_srl16e_luts(max_d + 1, bitwidth) > luts
+
+
+class TestNeedsMinimization:
+    """Test the needs_minimization method."""
+
+    # TODO: Maybe remove this behavior
+    def test_small_depths_no_minimization(self):
+        """Test that small depths don't need minimization."""
+        from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation
+
+        sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation)
+        sim.max_qsrl_depth = 256
+
+        # Depths <= 32 don't need minimization (fit in bitwidth/2 LUTs)
+        assert not sim._needs_minimization(32, 8)
+        assert not sim._needs_minimization(16, 8)
+        assert not sim._needs_minimization(2, 8)
+
+    # TODO: Maybe remove this behavior
+    def test_qsrl_range_no_minimization(self):
+        """Test that depths within QSRL range don't need minimization."""
+        from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation
+
+        sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation)
+        sim.max_qsrl_depth = 256
+
+        # Depths within max_qsrl_depth don't need minimization
+        assert not sim._needs_minimization(128, 8)
+        assert not sim._needs_minimization(256, 8)
+
+    def test_large_depths_need_minimization(self):
+        """Test that large depths with multiple BRAM blocks need minimization."""
+        from finn.transformation.fpgadataflow.simulation import (
+            RunLayerParallelSimulation,
+            calculate_bram_blocks,
+            calculate_bram_depth_range,
+        )
+
+        sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation)
+        sim.max_qsrl_depth = 256
+
+        # Test with specific known cases first
+        # bitwidth=8: 1 BRAM range is (1, 2048)
+        # Use depth > 2048 to get multiple blocks
+        depth = 5000
+        bitwidth = 8
+        blocks = calculate_bram_blocks(depth, bitwidth)
+        assert blocks > 1, f"depth={depth}, bitwidth={bitwidth} should use >1 BRAM"
+        assert sim._needs_minimization(depth, bitwidth)
+
+        # bitwidth=18: 1 BRAM range is (1, 1024)
+        # Use depth > 1024 to get multiple blocks
+        depth = 3000
+        bitwidth = 18
+        blocks = calculate_bram_blocks(depth, bitwidth)
+        assert blocks > 1, f"depth={depth}, bitwidth={bitwidth} should use >1 BRAM"
+        assert sim._needs_minimization(depth, bitwidth)
+
+        # Verify that depth with 1 BRAM doesn't need minimization
+        # when it's at minimum block count
+        depth = 1000
+        bitwidth = 8
+        blocks = calculate_bram_blocks(depth, bitwidth)
+        assert blocks == 1
+        assert not sim._needs_minimization(depth, bitwidth)
+
+        # Exhaustive test: check that depths with MORE than minimum BRAM blocks
+        # need minimization (unless very close to QSRL threshold)
+        for bw in range(1, 64):
+            # Find the minimum achievable block count for this bitwidth
+            min_blocks = None
+            max_d = 0
+            test_blocks = 1
+            while max_d == 0:
+                _, max_d = calculate_bram_depth_range(test_blocks, bw)
+                if max_d > 0:
+                    min_blocks = test_blocks
+                    break
+                test_blocks += 1
+
+            if min_blocks is None:
+                continue  # Skip if no valid config found
+
+            # Test depths that use more blocks than minimum
+            for depth in range(1, 8192):
+                blocks = calculate_bram_blocks(depth, bw)
+
+                # Only expect minimization if blocks > minimum achievable
+                if blocks > min_blocks and depth > math.floor(sim.max_qsrl_depth * 1.1):
+                    assert sim._needs_minimization(depth, bw), (
+                        f"depth={depth}, bw={bw}, blocks={blocks}, min_blocks={min_blocks} "
+                        f"should need minimization"
+                    )
+
+    def test_minimum_bram_edge_case(self):
+        """Test edge case at minimum BRAM blocks."""
+        from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation
+
+        sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation)
+        sim.max_qsrl_depth = 256
+
+        # A depth that's just slightly above max_qsrl_depth with minimum BRAM blocks
+        # The behavior depends on whether it's deemed too close to optimize
+        depth = 300
+        bitwidth = 1
+
+        # Verify the method executes without error
+        result = sim._needs_minimization(depth, bitwidth)
+        assert isinstance(result, bool)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])