diff --git a/.gitignore b/.gitignore index e830c1d3c9..ad0f998716 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,19 @@ tags poetry.lock *.code-workspace .env +*.vim + +# Cmake files +**/CMakeFiles +**/cmake_install.cmake +**/CMakeCache.txt +**/compile_commands.json +**/.cache +**/build +**/_deps +finn_xsi/finn_xsi/unittests/*.cmake +finn_xsi/finn_xsi/unittests/Makefile + settings.yaml */.cache/* @@ -100,6 +113,9 @@ MANIFEST /data/ *.csv +# Mock templated simulation config +finn_xsi/finn_xsi/rtlsim_config.hpp + # Google Drive key for dashboard /gdrive-key/ @@ -108,7 +124,7 @@ MANIFEST # downloaded dep repos /deps/ -/finn_deps/ +finn_deps/ # local test directories for benchmarking infrastructure bench_input diff --git a/finn-rtllib/removedatapath/hdl/dummy_template.v b/finn-rtllib/removedatapath/hdl/dummy_template.v new file mode 100644 index 0000000000..36dec63915 --- /dev/null +++ b/finn-rtllib/removedatapath/hdl/dummy_template.v @@ -0,0 +1,25 @@ +module $TOP_MODULE_NAME$( +//- Global Control ------------------ +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out0_V, ASSOCIATED_RESET = ap_rst_n" *) +(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) +input ap_rst_n, + +//- AXI Stream - Input -------------- +output in0_V_TREADY, +input in0_V_TVALID, +input [$WIDTH$-1:0] in0_V_TDATA, + +//- AXI Stream - Output -------------- +input out0_V_TREADY, +output out0_V_TVALID, +output [$WIDTH$-1:0] out0_V_TDATA +); + +assign in0_V_TREADY = out0_V_TREADY; +assign out0_V_TVALID = in0_V_TVALID; +assign out0_V_TDATA = 0; + + +endmodule diff --git a/finn_xsi/finn_xsi/.clang-format b/finn_xsi/finn_xsi/.clang-format new file mode 100644 index 0000000000..2df30c132e --- /dev/null +++ b/finn_xsi/finn_xsi/.clang-format @@ -0,0 +1,46 @@ +BasedOnStyle: Chromium +AccessModifierOffset: '1' +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: 'true' +AlignTrailingComments: 'true' +AllowAllArgumentsOnNextLine: 'true' +AllowShortBlocksOnASingleLine: 'true' +AllowShortFunctionsOnASingleLine: 'true' +AllowShortCaseLabelsOnASingleLine: 'false' +AlwaysBreakTemplateDeclarations: 'Yes' +BinPackParameters: 'true' +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +ColumnLimit: '180' +Cpp11BracedListStyle: 'true' +FixNamespaceComments: 'true' +IndentCaseLabels: 'true' +IndentPPDirectives: BeforeHash +IndentWidth: '4' +IndentWrappedFunctionNames: 'true' +IncludeBlocks: Regroup +KeepEmptyLinesAtTheStartOfBlocks: 'false' +Language: Cpp +MaxEmptyLinesToKeep: '2' +NamespaceIndentation: All +PointerAlignment: Left +ReflowComments: 'true' +SortIncludes: 'true' +SortUsingDeclarations: 'true' +SpaceAfterCStyleCast: 'true' +SpaceAfterLogicalNot: 'false' +SpaceAfterTemplateKeyword: 'false' +SpaceBeforeCpp11BracedList: 'false' +SpaceBeforeCtorInitializerColon: 'true' +SpaceBeforeInheritanceColon: 'true' +SpaceInEmptyParentheses: 'false' +SpacesInAngles: 'false' +SpacesInCStyleCastParentheses: 'false' +SpacesInContainerLiterals: 'false' +SpacesInParentheses: 'false' +SpacesInSquareBrackets: 'false' +TabWidth: '4' +--- +Language: Json +BasedOnStyle: llvm diff --git a/finn_xsi/finn_xsi/CMakeLists.txt b/finn_xsi/finn_xsi/CMakeLists.txt new file mode 100644 index 0000000000..5f4daa4c12 --- /dev/null +++ b/finn_xsi/finn_xsi/CMakeLists.txt @@ -0,0 +1,134 @@ +cmake_minimum_required(VERSION 3.11) +project(LayerSimulationBackend) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +# Require C++20 +set(CMAKE_CXX_EXTENSIONS ON) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +message(STATUS "Using C++ Standard ${CMAKE_CXX_STANDARD}") +SET(CMAKE_COLOR_MAKEFILE ON) + +message(STATUS "CMake cwd: ${CMAKE_CURRENT_SOURCE_DIR}") + +# Export compile commands for clangd +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# INCLUDES + +#Threads +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + +#OpenMP +find_package(OpenMP REQUIRED) + +#Compiler Options +add_library(fifosim_options INTERFACE) +add_library(fifosim::options ALIAS fifosim_options) + +OPTION(FIFOSIM_ENABLE_ALLOPT "Enable all optimizations" ON) +if(${FIFOSIM_ENABLE_ALLOPT}) + message(STATUS "All optimizations are enabled") + target_compile_options( + fifosim_options + INTERFACE -Ofast -ffast-math -march=native -mtune=native -fstack-protector-strong -fopenmp -ffunction-sections -fdata-sections -pipe -funroll-loops -shared -fPIC -Wno-interference-size + # Additional performance options: + -flto=auto # Link-time optimization (auto-detect thread count) + -fno-plt # Avoid PLT for better performance with shared libs + -fno-semantic-interposition # Allow more aggressive optimization in shared libs + -ftree-vectorize # Enable auto-vectorization (usually on with -O3) + -fvect-cost-model=dynamic # Better vectorization cost model + -fprefetch-loop-arrays # Prefetch arrays in loops + -fno-math-errno # Don't set errno for math functions (covered by -ffast-math mostly) + -fno-trapping-math # Allow optimizations that may trap (part of -ffast-math) + -ffinite-math-only # Assume no NaN/Inf (part of -ffast-math) + -fassociative-math # Allow reassociation (part of -ffast-math) + ) + target_link_options( + fifosim_options + INTERFACE + -flto=auto # LTO at link time + -Wl,--gc-sections # Remove unused sections + -Wl,--as-needed # Only link needed libraries + -Wl,-O3 # Linker optimization level + -Wl,--hash-style=gnu # Faster symbol lookup +) + #target_link_options(fifosim_options INTERFACE -fsanitize=undefined,address) +endif() + +### Enable compiler warnings +option(FIFOSIM_ENABLE_WARNINGS "Enable warnings" ON) +if (FIFOSIM_ENABLE_WARNINGS) + include(cmake/CompilerWarnings.cmake) + fifosim_set_project_warnings( + fifosim_options + OFF + "" + "" + "" + "") +endif (FIFOSIM_ENABLE_WARNINGS) + +# Use ccache if available +find_program(CCACHE_PROGRAM ccache) +if(CCACHE_PROGRAM) + message(STATUS "Using ccache for builds") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") +endif() + +# +# Create options for including cmake files from the cmake folder with a bit of output. +# +macro(check_include) + if(NOT ${ARGC} EQUAL 3) + message(FATAL_ERROR "Call to 'check_include' with ${ARGC} arguments instead of 3") + endif() + OPTION(${ARGV0} "Enable ${ARGV0}" ON) + if (${ARGV0}) + message(STATUS "${ARGV1}: enabled") + include(cmake/${ARGV2}) + else() + message(STATUS "${ARGV1}: disabled") + endif() +endmacro() + +message(STATUS "Checks:") +list(APPEND CMAKE_MESSAGE_INDENT " ") #indent +1 +check_include(FIFOSIM_IPO "InterproceduralOptimization" InterproceduralOptimization.cmake) +list(POP_BACK CMAKE_MESSAGE_INDENT) #indent -1 + +# Collect source files +file(GLOB_RECURSE CORE_SRC src/*.cpp) + +# For JSON writing +include(FetchContent) +FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.12.0/json.tar.xz) +FetchContent_MakeAvailable(json) + +# Add boost for PO +find_package(Boost COMPONENTS program_options REQUIRED) + +# Build the simulation library +add_library(SimulationBackendLib SHARED ${CORE_SRC}) +target_include_directories(SimulationBackendLib PUBLIC "${CMAKE_BINARY_DIR}") # Include the rtlsim wrapper directory itself +target_include_directories(SimulationBackendLib PUBLIC "$ENV{XILINX_VIVADO}/data/xsim/include") # Add xsim includes +target_include_directories(SimulationBackendLib PUBLIC "include") +target_link_libraries(SimulationBackendLib PUBLIC fifosim::options nlohmann_json::nlohmann_json Threads::Threads OpenMP::OpenMP_CXX -ldl -lrt) + +# Build the executable for connected simulations +add_executable(LayerSimulationBackend LayerSimulationBackend.cpp) +target_include_directories(LayerSimulationBackend SYSTEM PUBLIC ${Boost_INCLUDE_DIRS}) +target_link_libraries(LayerSimulationBackend SimulationBackendLib Boost::program_options) + +# Build the executable for isolated simulations +add_executable(IsolatedSimulationBackend IsolatedSimulationBackend.cpp) +target_include_directories(IsolatedSimulationBackend SYSTEM PUBLIC ${Boost_INCLUDE_DIRS}) +target_link_libraries(IsolatedSimulationBackend SimulationBackendLib Boost::program_options) + +OPTION(ENABLE_UNITTESTS "Enable unittests" OFF) +if(${ENABLE_UNITTESTS}) +add_subdirectory(unittests) +endif() diff --git a/finn_xsi/finn_xsi/IsolatedSimulationBackend.cpp b/finn_xsi/finn_xsi/IsolatedSimulationBackend.cpp new file mode 100644 index 0000000000..92de4a1772 --- /dev/null +++ b/finn_xsi/finn_xsi/IsolatedSimulationBackend.cpp @@ -0,0 +1,153 @@ +#include +#include +#include +#include +#include +#include + +namespace po = boost::program_options; + + +std::string getTime() { + auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + auto formatted = std::put_time(std::localtime(&now), "[%T]"); + std::stringstream ss; + ss << formatted; + return ss.str(); +} + + +int main(int argc, const char* argv[]) { + // Parse CLI options + po::options_description desc{"Options"}; + desc.add_options()("socket,s", po::value(), "Unix domain socket path for IPC"); + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + // Create simulation + IsolatedSimulation sim( + RTLSimConfig::kernel_libname, + RTLSimConfig::design_libname, + "xsim_log_file.txt", + "trace_file.wdb", + RTLSimConfig::istream_descs, + RTLSimConfig::ostream_descs + ); + + + + // Create controller + if (vm.count("socket")) { + const std::string socket_path = vm["socket"].as(); + std::cout << "Initializing socket server at: " << socket_path << std::endl; + std::cout.flush(); + + SocketServer server(socket_path); + if (auto error = server.initialize(); error.has_value()) { + std::cerr << "Failed to initialize socket server: " << *error << std::endl; + std::cerr.flush(); + return 1; + } + + std::cout << "Socket server initialized, waiting for commands..." << std::endl; + std::cout.flush(); + + // Preparing thread variable + std::optional simThread = std::nullopt; + std::mutex simMutex; + + // Command processing loop + std::size_t cycles = 0; + std::size_t statusSent = 0; + json response; + while (true) { + response = json::object(); + // Read message + std::cout << getTime() << " Awaiting message..." << std::endl; + auto request = server.receive_message(); + if (!request.has_value()) { + std::cout << getTime() << " Connection closed or error occurred" << std::endl; + break; + } + + // Process message + std::string command = (*request)["command"]; + std::cout << getTime() << " [Received command] " << command << std::endl; + if (command == "start") { + std::cout << getTime() << " Starting simulation" << std::endl; + if (!simThread.has_value()) { + simThread = std::jthread([&sim, &simMutex, &cycles](std::stop_token stop) { + { + std::lock_guard guard(simMutex); + sim.simulate(true); + } + std::cout << getTime() << " Simulation initialized. Going into main loop." << std::endl; + while (!stop.stop_requested()) { + std::lock_guard guard(simMutex); + if (cycles % 10000 == 0) { + std::cout << cycles << " " << sim.getStatus() << std::endl; + } + sim.simulate(false); + ++cycles; + if (sim.isDone()) { + // For now do not clean up the JSON logs, as this is + // done by the "stop" command from the python side of things. + // TODO: However this should be changed when the communication is + // rewritten + sim.commitLogsToDisk(false); + break; + } + } + }); + } else { + std::lock_guard guard(simMutex); + sim.resume(); + } + response["state"] = "running"; + server.send_message(response); + } else if (command == "stop") { + std::cout << getTime() << " Stopping simulation." << std::endl; + std::lock_guard guard(simMutex); + std::cout << getTime() << " Final status: " << sim.getStatus() << std::endl; + std::cout << getTime() << " Is done? " << sim.isDone() << std::endl; + sim.halt(); + if (simThread.has_value()) { + simThread->request_stop(); + } + sim.commitLogsToDisk(true); + response["state"] = "stopped"; + server.send_message(response); + } else if (command == "pause") { + std::cout << getTime() << " Pausing simulation." << std::endl; + std::lock_guard guard(simMutex); + if (simThread.has_value()) { + simThread->request_stop(); + } + response["state"] = "halted"; + server.send_message(response); + } else if (command == "status") { + std::cout << getTime() << " [Sending] Sending status update " << statusSent + 1 << std::endl; + std::lock_guard guard(simMutex); + json status = sim.getStatus(); + server.send_message(status); + statusSent++; + std::cout << getTime() << " [Sending] Status " << statusSent << " update sent!" << std::endl; + } else { + std::cout << getTime() << " Unknown command " << command << std::endl; + std::cerr << "Unknown command " << command << std::endl; + response["state"] = "unknown_command"; + server.send_message(response); + } + + // Exit if stop command received + if ((*request)["command"] == "stop") { + break; + } + } + simThread->join(); + } else { + throw std::runtime_error("Socket path not provided. Socket communication is required."); + } + return 0; +} diff --git a/finn_xsi/finn_xsi/LayerSimulationBackend.cpp b/finn_xsi/finn_xsi/LayerSimulationBackend.cpp new file mode 100644 index 0000000000..314aa0a41a --- /dev/null +++ b/finn_xsi/finn_xsi/LayerSimulationBackend.cpp @@ -0,0 +1,372 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NDEBUG +#include +#include + +namespace po = boost::program_options; + +constexpr std::size_t InstreamCount = RTLSimConfig::istream_descs.size(); +constexpr std::size_t OutstreamCount = RTLSimConfig::ostream_descs.size(); + +static_assert(InstreamCount == RTLSimConfig::inputInterfaceNames.size(), "Number of input streams must match number of previous nodes"); +static_assert(OutstreamCount == RTLSimConfig::outputInterfaceNames.size(), "Number of output streams must match number of next nodes"); + +// Simulation state management +enum class SimulationState { IDLE, CONFIGURED, RUNNING, FINISHED, ERROR }; + +class SimulationController { + private: + SingleNodeSimulation& sim; + std::atomic state{SimulationState::IDLE}; + std::atomic current_cycles{0}; + std::atomic current_samples{0}; + std::mutex state_mutex; + std::string error_message; + std::jthread sim_thread; + std::vector fifo_depths{2}; + std::size_t max_cycles{std::numeric_limits::max()}; + bool timeout_occurred{false}; + + public: + explicit SimulationController(SingleNodeSimulation& simulation) + : sim(simulation) {} + + void configure(const std::vector& depths, const std::vector& expected_first_valid_cycles, std::size_t maxCycles) { + std::lock_guard lock(state_mutex); + if (state != SimulationState::IDLE && state != SimulationState::FINISHED) { + throw std::runtime_error("Cannot configure while simulation is running"); + } + fifo_depths = depths; + current_cycles = 0; + current_samples = 0; + max_cycles = maxCycles; + state = SimulationState::CONFIGURED; + + // Reset simulation first + sim.reset(); + + // Configure FIFO depths AFTER reset + std::size_t num_fifos = sim.getFIFOCount(); + + if (fifo_depths.empty()) { + throw std::runtime_error("FIFO depths not configured"); + } + + // Apply depths: if list is shorter, use last value for remaining FIFOs + for (std::size_t i = 0; i < num_fifos; ++i) { + std::size_t depth_idx = std::min(i, fifo_depths.size() - 1); + sim.setFIFODepth(i, fifo_depths[depth_idx]); + } + + for (std::size_t i = 0; i < expected_first_valid_cycles.size(); ++i) { + std::size_t cycles_idx = std::min(i, expected_first_valid_cycles.size() - 1); + sim.setFIFOCyclesUntilExpectedFirstValid(i, expected_first_valid_cycles[cycles_idx]); + } + } + + void start() { + std::lock_guard lock(state_mutex); + if (state != SimulationState::CONFIGURED) { + throw std::runtime_error("Simulation must be configured before starting"); + } + + state = SimulationState::RUNNING; + + // Start simulation in a separate thread + sim_thread = std::jthread([this](std::stop_token stoken) { + try { + std::cout << "Starting simulation with max cycles: " << max_cycles << std::endl; + + // Run the simulation + bool timeout = sim.runToStableState(stoken, max_cycles); + + if (timeout) { + state = SimulationState::FINISHED; + timeout_occurred = true; + } + + // Update state based on completion + if (!stoken.stop_requested()) { + current_samples.store(sim.getCompletedMaps()); + state = SimulationState::FINISHED; + } + state = SimulationState::FINISHED; + } catch (const std::exception& e) { + std::lock_guard error_lock(state_mutex); + std::cout << "Simulation error: " << e.what() << std::endl; + error_message = e.what(); + state = SimulationState::ERROR; + } + }); + } + + void stop() { + if (sim_thread.joinable()) { + sim_thread.request_stop(); + sim_thread.join(); + } + if (state == SimulationState::RUNNING) { + state = SimulationState::FINISHED; + } + } + + json get_status() const { + json status; + status["status"] = "success"; + + SimulationState current_state = state.load(); + switch (current_state) { + case SimulationState::IDLE: + status["state"] = "idle"; + break; + case SimulationState::CONFIGURED: + status["state"] = "configured"; + break; + case SimulationState::RUNNING: + status["state"] = "running"; + status["cycles"] = sim.getCyclesRun(); + status["samples"] = sim.getCompletedMaps(); + break; + case SimulationState::FINISHED: + status["state"] = "finished"; + status["timeout"] = timeout_occurred; + if (timeout_occurred) { + status["state"] = "timeout"; + } + status["cycles"] = sim.getCyclesRun(); + status["samples"] = sim.getCompletedMaps(); + status["intervals"] = sim.getOStreamStableStateIntervals(); + // Add FIFO depth data + { + auto depths = sim.getFIFODepth(); + json fifo_depth = json::array(); + for (size_t i = 0; i < depths.size(); ++i) { + fifo_depth.push_back(depths[i]); + } + if (!fifo_depth.empty()) { + status["fifo_depth"] = fifo_depth; + } + } + // Add FIFO utilization data + { + auto utilizations = sim.getFIFOUtilization(); + json fifo_util = json::array(); + for (size_t i = 0; i < utilizations.size(); ++i) { + fifo_util.push_back(utilizations[i]); + } + if (!fifo_util.empty()) { + status["fifo_utilization"] = fifo_util; + } + } + // Add FIFO cycles until first valid data + { + auto cycles_until_valid = sim.getFIFOCyclesUntilFirstValid(); + json fifo_cycles = json::array(); + for (size_t i = 0; i < cycles_until_valid.size(); ++i) { + fifo_cycles.push_back(cycles_until_valid[i]); + } + if (!fifo_cycles.empty()) { + status["fifo_cycles_until_first_valid"] = fifo_cycles; + } + } + // Add input/output job sizes + { + json in_job_sizes = json::array(); + for (size_t i = 0; i < InstreamCount; ++i) { + in_job_sizes.push_back(sim.getInputJobSize(i)); + } + status["input_job_size"] = in_job_sizes; + + json out_job_sizes = json::array(); + for (size_t i = 0; i < OutstreamCount; ++i) { + out_job_sizes.push_back(sim.getOutputJobSize(i)); + } + status["output_job_size"] = out_job_sizes; + } + break; + case SimulationState::ERROR: + status["state"] = "error"; + status["message"] = error_message; + break; + } + return status; + } + + ~SimulationController() { stop(); } +}; + +void process_command(const json& request, json& response, SimulationController& controller) { + const std::string command = request["command"]; + const json& payload = request["payload"]; + + try { + if (command == "configure") { + std::vector fifo_depths; + + // std::cout << "Payload: " << payload << std::endl; + + // Handle fifo_depth as either a single value or an array + if (payload.contains("fifo_depth")) { + const auto& depth_value = payload["fifo_depth"]; + if (depth_value.is_array()) { + for (const auto& val : depth_value) { + fifo_depths.push_back(val.get()); + } + } else { + fifo_depths.push_back(depth_value.get()); + } + } else { + fifo_depths.push_back(std::numeric_limits::max()); // Default value + } + + std::vector expected_first_valid_cycles; + if (payload.contains("fifo_first_valid_cycles")) { + const auto& expected_cycles_value = payload["fifo_first_valid_cycles"]; + if (expected_cycles_value.is_array()) { + for (const auto& val : expected_cycles_value) { + expected_first_valid_cycles.push_back(val.get()); + } + } else { + expected_first_valid_cycles.push_back(expected_cycles_value.get()); + } + } + + if (fifo_depths.empty()) { + throw std::runtime_error("FIFO depth list cannot be empty"); + } + + std::size_t max_cycles = std::numeric_limits::max(); + if (payload.contains("max_cycles")) { + max_cycles = payload["max_cycles"].get(); + } + + controller.configure(fifo_depths, expected_first_valid_cycles, max_cycles); + response["status"] = "success"; + response["message"] = "Configuration successful"; + } else if (command == "start") { + controller.start(); + response["status"] = "success"; + response["message"] = "Simulation started"; + } else if (command == "status") { + response = controller.get_status(); + } else if (command == "stop") { + controller.stop(); + response["status"] = "success"; + response["message"] = "Simulation stopped"; + // Include final status with FIFO utilization and depth + json final_status = controller.get_status(); + if (final_status.contains("fifo_utilization")) { + response["fifo_utilization"] = final_status["fifo_utilization"]; + } + if (final_status.contains("fifo_depth")) { + response["fifo_depth"] = final_status["fifo_depth"]; + } + if (final_status.contains("cycles")) { + response["cycles"] = final_status["cycles"]; + } + if (final_status.contains("samples")) { + response["samples"] = final_status["samples"]; + } + if (final_status.contains("intervals")) { + response["intervals"] = final_status["intervals"]; + } + if (final_status.contains("timeout")) { + response["timeout"] = final_status["timeout"]; + } + if (final_status.contains("fifo_cycles_until_first_valid")) { + response["fifo_cycles_until_first_valid"] = final_status["fifo_cycles_until_first_valid"]; + } + if (final_status.contains("input_job_size")) { + response["input_job_size"] = final_status["input_job_size"]; + } + if (final_status.contains("output_job_size")) { + response["output_job_size"] = final_status["output_job_size"]; + } + } else { + response["status"] = "error"; + response["message"] = "Unknown command: " + command; + } + } catch (const std::exception& e) { + response["status"] = "error"; + response["message"] = std::string("Error: ") + e.what(); + } +} + +int main(int argc, const char* argv[]) { + // Parse CLI options + po::options_description desc{"Options"}; + desc.add_options()("socket,s", po::value(), "Unix domain socket path for IPC"); + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + std::cout << "Connected Simulation Node Index: " << RTLSimConfig::NodeIndex << " / " << RTLSimConfig::TotalNodes << std::endl; + + // Check if socket communication is enabled + if (vm.count("socket")) { + const std::string socket_path = vm["socket"].as(); + std::cout << "Initializing socket server at: " << socket_path << std::endl; + std::cout.flush(); + + SocketServer server(socket_path); + if (auto error = server.initialize(); error.has_value()) { + std::cerr << "Failed to initialize socket server: " << *error << std::endl; + std::cerr.flush(); + return 1; + } + + std::cout << "Socket server initialized, waiting for commands..." << std::endl; + std::cout.flush(); + + // Construct simulation + SingleNodeSimulation + sim(RTLSimConfig::kernel_libname, RTLSimConfig::design_libname, "xsim_log_file.txt", "trace_file.txt", RTLSimConfig::istream_descs, RTLSimConfig::ostream_descs, + RTLSimConfig::inputInterfaceNames, RTLSimConfig::outputInterfaceNames, 2); + + // Create simulation controller + SimulationController controller(sim); + + // Command processing loop + while (true) { + auto request = server.receive_message(); + if (!request.has_value()) { + std::cout << "Connection closed or error occurred" << std::endl; + break; + } + + json response; + process_command(*request, response, controller); + server.send_message(response); + + // Exit if stop command received + if ((*request)["command"] == "stop") { + break; + } + } + } else { + throw std::runtime_error("Socket path not provided. Socket communication is required."); + } + + return 0; +} diff --git a/finn_xsi/finn_xsi/adapter.py b/finn_xsi/finn_xsi/adapter.py index 993aaa95c8..859daeed19 100644 --- a/finn_xsi/finn_xsi/adapter.py +++ b/finn_xsi/finn_xsi/adapter.py @@ -78,15 +78,15 @@ def compile_sim_obj(top_module_name, source_list, sim_out_dir, debug=False, beha "floating_point_v7_1_18", "floating_point_v7_1_15", "floating_point_v7_1_19", + "work", ] cmd_xelab = [ "xelab", - "work." + top_module_name, + "work." + "finn_design_wrapper", "-relax", - "-prj", - "rtlsim.prj", "-dll", + "--O3", "-s", top_module_name, ] @@ -105,7 +105,10 @@ def compile_sim_obj(top_module_name, source_list, sim_out_dir, debug=False, beha if locate_glbl() is not None: cmd_xelab.insert(1, "work.glbl") - launch_process_helper(cmd_xelab, cwd=sim_out_dir) + cmd_xvlog = "xvlog --incr --relax -prj rtlsim.prj".split() + + launch_process_helper(cmd_xvlog, cwd=sim_out_dir, print_stdout=False) + launch_process_helper(cmd_xelab, cwd=sim_out_dir, print_stdout=False) out_so_relative_path = "xsim.dir/%s/xsimk.so" % top_module_name out_so_full_path = sim_out_dir + "/" + out_so_relative_path diff --git a/finn_xsi/finn_xsi/cmake/CompilerWarnings.cmake b/finn_xsi/finn_xsi/cmake/CompilerWarnings.cmake new file mode 100644 index 0000000000..a606ab5163 --- /dev/null +++ b/finn_xsi/finn_xsi/cmake/CompilerWarnings.cmake @@ -0,0 +1,115 @@ +# from here: +# +# https://github.com/lefticus/cppbestpractices/blob/master/02-Use_the_Tools_Available.md + +function( + fifosim_set_project_warnings + project_name + WARNINGS_AS_ERRORS + MSVC_WARNINGS + CLANG_WARNINGS + GCC_WARNINGS + CUDA_WARNINGS) + if("${MSVC_WARNINGS}" STREQUAL "") + set(MSVC_WARNINGS + /W4 # Baseline reasonable warnings + /w14242 # 'identifier': conversion from 'type1' to 'type2', possible loss of data + /w14254 # 'operator': conversion from 'type1:field_bits' to 'type2:field_bits', possible loss of data + /w14263 # 'function': member function does not override any base class virtual member function + /w14265 # 'classname': class has virtual functions, but destructor is not virtual instances of this class may not + # be destructed correctly + /w14287 # 'operator': unsigned/negative constant mismatch + /we4289 # nonstandard extension used: 'variable': loop control variable declared in the for-loop is used outside + # the for-loop scope + /w14296 # 'operator': expression is always 'boolean_value' + /w14311 # 'variable': pointer truncation from 'type1' to 'type2' + /w14545 # expression before comma evaluates to a function which is missing an argument list + /w14546 # function call before comma missing argument list + /w14547 # 'operator': operator before comma has no effect; expected operator with side-effect + /w14549 # 'operator': operator before comma has no effect; did you intend 'operator'? + /w14555 # expression has no effect; expected expression with side- effect + /w14619 # pragma warning: there is no warning number 'number' + /w14640 # Enable warning on thread un-safe static member initialization + /w14826 # Conversion from 'type1' to 'type2' is sign-extended. This may cause unexpected runtime behavior. + /w14905 # wide string literal cast to 'LPSTR' + /w14906 # string literal cast to 'LPWSTR' + /w14928 # illegal copy-initialization; more than one user-defined conversion has been implicitly applied + /permissive- # standards conformance mode for MSVC compiler. + ) + endif() + + if("${CLANG_WARNINGS}" STREQUAL "") + set(CLANG_WARNINGS + -Wall + -Wextra # reasonable and standard + -Wshadow # warn the user if a variable declaration shadows one from a parent context + -Wnon-virtual-dtor # warn the user if a class with virtual functions has a non-virtual destructor. This helps + # catch hard to track down memory errors + -Wold-style-cast # warn for c-style casts + -Wcast-align # warn for potential performance problem casts + -Wunused # warn on anything being unused + -Woverloaded-virtual # warn if you overload (not override) a virtual function + -Wpedantic # warn if non-standard C++ is used + -Wconversion # warn on type conversions that may lose data + -Wsign-conversion # warn on sign conversions + -Wnull-dereference # warn if a null dereference is detected + -Wdouble-promotion # warn if float is implicit promoted to double + -Wformat=2 # warn on security issues around functions that format output (ie printf) + -Wimplicit-fallthrough # warn on statements that fallthrough without an explicit annotation + ) + endif() + + if("${GCC_WARNINGS}" STREQUAL "") + set(GCC_WARNINGS + ${CLANG_WARNINGS} + -Wmisleading-indentation # warn if indentation implies blocks where blocks do not exist + -Wduplicated-cond # warn if if / else chain has duplicated conditions + -Wduplicated-branches # warn if if / else branches have duplicated code + -Wlogical-op # warn about logical operations being used where bitwise were probably wanted + -Wuseless-cast # warn if you perform a cast to the same type + ) + endif() + + if("${CUDA_WARNINGS}" STREQUAL "") + set(CUDA_WARNINGS + -Wall + -Wextra + -Wunused + -Wconversion + -Wshadow + # TODO add more Cuda warnings + ) + endif() + + if(WARNINGS_AS_ERRORS) + message(TRACE "Warnings are treated as errors") + list(APPEND CLANG_WARNINGS -Werror) + list(APPEND GCC_WARNINGS -Werror) + list(APPEND MSVC_WARNINGS /WX) + endif() + + if(MSVC) + set(PROJECT_WARNINGS_CXX ${MSVC_WARNINGS}) + elseif(CMAKE_CXX_COMPILER_ID MATCHES ".*Clang") + set(PROJECT_WARNINGS_CXX ${CLANG_WARNINGS}) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(PROJECT_WARNINGS_CXX ${GCC_WARNINGS}) + else() + message(AUTHOR_WARNING "No compiler warnings set for CXX compiler: '${CMAKE_CXX_COMPILER_ID}'") + # TODO support Intel compiler + endif() + + # use the same warning flags for C + set(PROJECT_WARNINGS_C "${PROJECT_WARNINGS_CXX}") + + set(PROJECT_WARNINGS_CUDA "${CUDA_WARNINGS}") + + target_compile_options( + ${project_name} + INTERFACE # C++ warnings + $<$:${PROJECT_WARNINGS_CXX}> + # C warnings + $<$:${PROJECT_WARNINGS_C}> + # Cuda warnings + $<$:${PROJECT_WARNINGS_CUDA}>) +endfunction() diff --git a/finn_xsi/finn_xsi/cmake/InterproceduralOptimization.cmake b/finn_xsi/finn_xsi/cmake/InterproceduralOptimization.cmake new file mode 100644 index 0000000000..c5c513d14a --- /dev/null +++ b/finn_xsi/finn_xsi/cmake/InterproceduralOptimization.cmake @@ -0,0 +1,7 @@ +include(CheckIPOSupported) +check_ipo_supported(RESULT result OUTPUT output) +if(result) +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON) +else() +message(SEND_ERROR "IPO is not supported: ${output}") +endif() diff --git a/finn_xsi/finn_xsi/include/AXIS_Control.h b/finn_xsi/finn_xsi/include/AXIS_Control.h new file mode 100644 index 0000000000..c7f9a96f8b --- /dev/null +++ b/finn_xsi/finn_xsi/include/AXIS_Control.h @@ -0,0 +1,91 @@ +#ifndef AXIS_CONTROL +#define AXIS_CONTROL + +#include +#include +#include +#include +#include + +// Fwd declarations +namespace xsi { + class Design; + class Port; +} // namespace xsi +class Clock; + +class AXIS_Control : public CommunicationChannel { + public: + // Constructor/destructor + AXIS_Control(xsi::Design& design, Clock& clock, size_t job_size, const std::string& prefix = "s_axis_"); + AXIS_Control() = default; + virtual ~AXIS_Control() noexcept = default; + + AXIS_Control(AXIS_Control&& other) = default; + AXIS_Control& operator=(AXIS_Control&& other) = default; + + void inititialized_or_throw(); + + // Core functions - immediate writes + virtual void setInputValid(bool value = true, std::stop_token stoken = {}) override; + virtual bool getOutputValid(std::stop_token stoken = {}) noexcept override; + virtual void setOutputReady(bool value = true, std::stop_token stoken = {}) override; + virtual bool getInputReady(std::stop_token stoken = {}) noexcept override; + + // Deferred write functions + std::reference_wrapper setValid(bool value = true); + std::reference_wrapper setReady(bool value = true); + + virtual void writeBack() = 0; + + // Job Size and Transaction Statistics + size_t job_size; + size_t job_txns; // [0:job_size] + size_t total_txns; + size_t first_complete; // First completion timestamp + + // AXI interface prefix + std::string name; + + protected: + const xsi::Design* design; + const Clock* clk; + + xsi::Port* port_vld; + xsi::Port* port_rdy; +}; + +class S_AXIS_Control : public AXIS_Control { + public: + // Constructor/destructor + S_AXIS_Control(xsi::Design& design, Clock& clock, size_t job_size, size_t job_ticks, const std::string& prefix = "s_axis_"); + S_AXIS_Control() = default; + ~S_AXIS_Control() noexcept = default; + + S_AXIS_Control(S_AXIS_Control&& other) = default; + S_AXIS_Control& operator=(S_AXIS_Control&& other) = default; + + void writeBack() override; + + size_t job_ticks; // throttle if job_size < job_ticks + size_t await_iter; // iteration allowing start of next job +}; + +class M_AXIS_Control : public AXIS_Control { + public: + // Constructor/destructor + M_AXIS_Control(xsi::Design& design, Clock& clock, size_t job_size, const std::string& prefix = "m_axis_"); + M_AXIS_Control() = default; + ~M_AXIS_Control() noexcept = default; + + M_AXIS_Control(M_AXIS_Control&& other) = default; + M_AXIS_Control& operator=(M_AXIS_Control&& other) = default; + + void writeBack() override; + + size_t lastComplete = 0; + size_t interval = 0; + StableStateTracker<> stableState; +}; + +#endif /* AXIS_CONTROL */ diff --git a/finn_xsi/finn_xsi/include/AXI_Control.h b/finn_xsi/finn_xsi/include/AXI_Control.h new file mode 100644 index 0000000000..24e0e11237 --- /dev/null +++ b/finn_xsi/finn_xsi/include/AXI_Control.h @@ -0,0 +1,40 @@ +#ifndef AXI_CONTROL +#define AXI_CONTROL + +#include +#include + +// Fwd declarations +namespace xsi { + class Design; + class Port; +} // namespace xsi +class Clock; + +class AXI_Control { + public: + // Constructor/destructor + AXI_Control(xsi::Design& design, Clock& clock, const std::string& axi_prefix = "AXI_Control_0_0_"); + ~AXI_Control() noexcept = default; + + // // Core register access functions + void writeRegister(uint32_t addr, uint32_t data); + uint32_t readRegister(uint32_t addr); + + private: + // AXI interface prefix + std::string prefix; + xsi::Design& design; + Clock& clk; + + // Helper functions for multi-bit signal handling + void writeAddr(const std::string& signal, uint32_t addr); + void writeData(const std::string& signal, uint32_t data); + void writeStrb(const std::string& signal, uint32_t strb); + uint32_t read(const std::string& signal); + void setBool(const std::string& signal); + void clearBool(const std::string& signal); + bool chkBool(const std::string& signal); +}; + +#endif /* AXI_CONTROL */ diff --git a/finn_xsi/finn_xsi/include/Clock.h b/finn_xsi/finn_xsi/include/Clock.h new file mode 100644 index 0000000000..334d69690b --- /dev/null +++ b/finn_xsi/finn_xsi/include/Clock.h @@ -0,0 +1,36 @@ +#ifndef CLOCK +#define CLOCK + +#include + +// Fwd declarations +namespace xsi { + class Design; +} + +class Clock { + xsi::Design& design; + + Clock(Clock const&) = delete; + Clock& operator=(Clock const&) = delete; + Clock(xsi::Design& design); + template + friend class Simulation; + + public: + Clock(Clock&&) noexcept = default; + Clock& operator=(Clock&&) noexcept = default; + ~Clock() noexcept = default; + + std::function clkHigh; + std::function clkLow; + std::function cycle; + + + void toggleClk() noexcept; + + void clockHigh() noexcept; + void clockLow() noexcept; +}; + +#endif /* CLOCK */ diff --git a/finn_xsi/finn_xsi/include/CommunicationChannel.hpp b/finn_xsi/finn_xsi/include/CommunicationChannel.hpp new file mode 100644 index 0000000000..1f94659da9 --- /dev/null +++ b/finn_xsi/finn_xsi/include/CommunicationChannel.hpp @@ -0,0 +1,74 @@ +#ifndef COMMUNICATIONCHANNEL +#define COMMUNICATIONCHANNEL + +#include +#include +#include + +template +concept ChannelInterface = requires(T t, bool b, std::stop_token stoken) { + { t.getOutputValid(stoken) } -> std::same_as; + { t.setInputValid(b, stoken) } -> std::same_as; + { t.getInputReady(stoken) } -> std::same_as; + { t.setOutputReady(b, stoken) } -> std::same_as; +}; + +class CommunicationChannel { + // Function pointers for downstream object methods + bool (*downstreamGetInputReadyFn)(void*, std::stop_token) = nullptr; + void (*downstreamSetInputValidFn)(void*, bool, std::stop_token) = nullptr; + + void* downstreamObj = nullptr; + + protected: + // Derived classes call this to register their own methods + template + void registerSelfAs() { + // This is intentionally empty - we call methods directly on 'this' + // The template just ensures Derived implements ChannelInterface + } + + public: + template + void connectDownstream(Derived& downstreamPartner) { + this->downstreamObj = &downstreamPartner; + + // Store function pointers for calling the DOWNSTREAM object's methods + downstreamGetInputReadyFn = [](void* obj, std::stop_token stoken) -> bool { return static_cast(obj)->getInputReady(stoken); }; + downstreamSetInputValidFn = [](void* obj, bool v, std::stop_token stoken) { static_cast(obj)->setInputValid(v, stoken); }; + } + + // Mark as inline and noexcept for better optimization + inline void exchangeDataDownstream(std::stop_token stoken = {}) noexcept { + // Call methods on THIS object directly (non-virtual, resolved at compile time) + bool valid = this->getOutputValid(stoken); + // Call downstream object's methods via function pointers + downstreamSetInputValidFn(downstreamObj, valid, stoken); + bool ready = downstreamGetInputReadyFn(downstreamObj, stoken); + // Call method on THIS object directly + this->setOutputReady(ready, stoken); + } + + virtual bool getOutputValid([[maybe_unused]] std::stop_token stoken = {}) = 0; + virtual void setInputValid([[maybe_unused]] bool v, [[maybe_unused]] std::stop_token stoken = {}) = 0; + virtual bool getInputReady([[maybe_unused]] std::stop_token stoken = {}) = 0; + virtual void setOutputReady([[maybe_unused]] bool r, [[maybe_unused]] std::stop_token stoken = {}) = 0; + + virtual ~CommunicationChannel() = default; +}; + +// Example usage: +// class LayerA : public CommunicationChannel { +// public: +// bool getOutputValid(std::stop_token stoken = {}) { /* ... */ } +// void setInputValid(bool v, std::stop_token stoken = {}) { /* ... */ } +// bool getInputReady(std::stop_token stoken = {}) { /* ... */ } +// void setOutputReady(bool r, std::stop_token stoken = {}) { /* ... */ } +// }; +// +// LayerA a; +// LayerB b; +// a.connectDownstream(b); +// a.exchangeDataDownstream(); // or with stop_token: a.exchangeDataDownstream(stoken); + +#endif /* COMMUNICATIONCHANNEL */ diff --git a/finn_xsi/finn_xsi/include/Design.h b/finn_xsi/finn_xsi/include/Design.h new file mode 100644 index 0000000000..83a9016c43 --- /dev/null +++ b/finn_xsi/finn_xsi/include/Design.h @@ -0,0 +1,53 @@ +#ifndef DESIGN +#define DESIGN + +#include + +namespace xsi { + + // - non-copyable handle for exposing simulation control. + class Design { + xsi::Kernel _kernel; + + public: + Design(xsi::Kernel& kernel, const std::string& design_lib, const s_xsi_setup_info& setup_info); + Design(xsi::Kernel& kernel, const std::string& design_lib, const char* const log_file = nullptr, const char* const wdb_file = nullptr); + ~Design(); + + private: + Design(Design const&) = delete; + Design& operator=(Design const&) = delete; + + public: + // Move constructor + Design(Design&& other) noexcept; + + // Move assignment operator + Design& operator=(Design&& other) noexcept; + + //----------------------------------------------------------------------- + // Forwarded Access to Open Simulation + + // Simulation Control & Status + public: + void trace_all(); + void run(const XSI_INT64 step); + void restart(); + + int get_status() const noexcept; + const char* get_error_info() const noexcept; + + // Port Access + public: + int num_ports() const noexcept; + + xsi::Port& getPort(const std::string& name); + const xsi::Port& getPort(const std::string& name) const; + + std::span ports() noexcept; + std::span ports() const noexcept; + + }; // class Design +} // namespace xsi + +#endif /* DESIGN */ diff --git a/finn_xsi/finn_xsi/include/FIFO.h b/finn_xsi/finn_xsi/include/FIFO.h new file mode 100644 index 0000000000..9cb09f9970 --- /dev/null +++ b/finn_xsi/finn_xsi/include/FIFO.h @@ -0,0 +1,41 @@ +#ifndef FIFO_H +#define FIFO_H + +#include +#include +#include +#include + +class FIFO : public CommunicationChannel { + uint64_t maxUtil = 0; + uint64_t currentUtil = 0; + uint64_t maxSize = 0; + uint64_t nextUtil = 0; + uint64_t cyclesUntilExpectedFirstValid = std::numeric_limits::max(); + uint64_t initialCyclesUntilExpectedFirstValid = std::numeric_limits::max(); + + public: + FIFO(uint64_t size = std::numeric_limits::max()); + ~FIFO(); + + void update(bool incomingValid, bool incomingReady); + bool toggleClock(); + virtual bool getInputReady(std::stop_token stoken = {}) noexcept override; + virtual bool getOutputValid(std::stop_token stoken = {}) noexcept override; + bool isEmpty() const; + void reset(uint64_t size = std::numeric_limits::max()); + void setCyclesUntilExpectedFirstValid(uint64_t cycles); + uint64_t getCyclesUntilFirstValid() const; + void setMaxSize(const uint64_t size); + uint64_t getMaxSize() const; + uint64_t getSpaceLeft() const; + uint64_t getMaxUtil() const; + void increaseCounter(const uint64_t count); + + // NOTE: User needs to ensure proper ordering. No runtime enforcement of order. + virtual void setInputValid(bool incomingValid, std::stop_token stoken = {}) override; + virtual void setOutputReady(bool incomingReady, std::stop_token stoken = {}) override; + uint64_t size() const; +}; + +#endif /* FIFO_H */ diff --git a/finn_xsi/finn_xsi/include/InterprocessCommunicationChannel.hpp b/finn_xsi/finn_xsi/include/InterprocessCommunicationChannel.hpp new file mode 100644 index 0000000000..e8ba6d85fc --- /dev/null +++ b/finn_xsi/finn_xsi/include/InterprocessCommunicationChannel.hpp @@ -0,0 +1,247 @@ +#ifndef INTERPROCESSCOMMUNICATIONCHANNEL +#define INTERPROCESSCOMMUNICATIONCHANNEL + +#include +#include +#include +#include +#include + +#ifndef CACHE_LINE_SIZE + #ifdef __cpp_lib_hardware_interference_size +constexpr std::size_t CACHE_LINE_SIZE = std::hardware_destructive_interference_size; + #else +constexpr std::size_t CACHE_LINE_SIZE = 64; + #endif +#endif + +namespace bip = boost::interprocess; + +// ===== INTERPROCESS ASYMMETRIC REQUEST-RESPONSE EXCHANGE ===== +// Concepts for constraining methods based on role +template +concept Sender = IsSender; + +constexpr int MAX_SPIN_WAIT = 100; + +template +class InterprocessCommunicationChannel { + private: + // ===== SHARED MEMORY STRUCTURE ===== + struct alignas(CACHE_LINE_SIZE) SharedChannelData { + struct alignas(CACHE_LINE_SIZE) RequestSlot { + Request data; + std::atomic valid; + + RequestSlot() : data(), valid(false) {} + }; + + struct alignas(CACHE_LINE_SIZE) ResponseSlot { + Response data; + std::atomic valid; + + ResponseSlot() : data(), valid(false) {} + }; + + // Double-buffered requests and responses + RequestSlot requests[2]; + ResponseSlot responses[2]; + + alignas(CACHE_LINE_SIZE) std::atomic request_write_idx; + alignas(CACHE_LINE_SIZE) std::atomic request_read_idx; + alignas(CACHE_LINE_SIZE) std::atomic response_write_idx; + alignas(CACHE_LINE_SIZE) std::atomic response_read_idx; + + SharedChannelData() : request_write_idx(0), request_read_idx(0), response_write_idx(0), response_read_idx(0) { + // Verify atomics are lock-free (required for shared memory) + static_assert(std::atomic::is_always_lock_free, "std::atomic must be lock-free for inter-process use"); + static_assert(std::atomic::is_always_lock_free, "std::atomic must be lock-free for inter-process use"); + } + }; + + // ===== PROCESS-LOCAL STATE ===== + SharedChannelData* channel = nullptr; + std::atomic* refCount = nullptr; + const std::string sharedMemoryName; + bip::managed_shared_memory shmem; + + public: + // Default constructor + InterprocessCommunicationChannel() : sharedMemoryName("") {} + + // Constructor with shared memory name + InterprocessCommunicationChannel(const std::string& shmName) : sharedMemoryName(shmName) { + if constexpr (IsSender) { + // Sender creates shared memory + bip::shared_memory_object::remove(sharedMemoryName.c_str()); + shmem = bip::managed_shared_memory(bip::create_only, sharedMemoryName.c_str(), SharedMemorySize); + std::cout << "Created shared memory: " << sharedMemoryName << std::endl; + } else { + // Receiver opens existing shared memory + std::cout << "Waiting to connect to shared memory: " << sharedMemoryName << std::endl; + while (true) { + try { + shmem = bip::managed_shared_memory(bip::open_only, sharedMemoryName.c_str()); + break; + } catch (const bip::interprocess_exception& e) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } + } + std::cout << "Connected to shared memory: " << sharedMemoryName << std::endl; + } + + // Construct or find the reference counter + refCount = shmem.find_or_construct>("refCount")(0); + refCount->fetch_add(1, std::memory_order_acq_rel); + + // Construct the channel data in shared memory + channel = shmem.find_or_construct("ChannelData")(); + + } + + void handshake() { + // Perform handshake to verify communication works + if constexpr (IsSender) { + // Sender: send test request and wait for response + std::cout << "Sending handshake test request for " << sharedMemoryName << std::endl; + Request test_request{}; + Response test_response = send_request(test_request); + std::cout << "Received handshake test response for " << sharedMemoryName << std::endl; + // Communication verified if we got here without hanging + } else { + // Receiver: wait for test request and send response + std::cout << "Waiting for handshake test request for " << sharedMemoryName << std::endl; + Request test_request = receive_request(); + std::cout << "Received handshake test request for " << sharedMemoryName << std::endl; + Response test_response{}; + send_response(test_response); + std::cout << "Sent handshake test response for " << sharedMemoryName << std::endl; + // Communication verified if we got here + } + } + + // Delete copy operations + InterprocessCommunicationChannel(const InterprocessCommunicationChannel&) = delete; + InterprocessCommunicationChannel& operator=(const InterprocessCommunicationChannel&) = delete; + + // Move constructor + InterprocessCommunicationChannel(InterprocessCommunicationChannel&& other) noexcept + : channel(other.channel), refCount(other.refCount), sharedMemoryName(std::move(other.sharedMemoryName)), shmem(std::move(other.shmem)) { + other.channel = nullptr; + other.refCount = nullptr; + } + + // Move assignment operator + InterprocessCommunicationChannel& operator=(InterprocessCommunicationChannel&& other) noexcept { + if (this != &other) { + channel = other.channel; + refCount = other.refCount; + shmem.swap(other.shmem); + const_cast(sharedMemoryName) = std::move(other.sharedMemoryName); + + other.channel = nullptr; + other.refCount = nullptr; + } + return *this; + } + + ~InterprocessCommunicationChannel() { + if (!refCount || !channel) { + return; + } + + channel = nullptr; + refCount = nullptr; + + std::atomic* ref_ptr = shmem.find>("refCount").first; + if (!ref_ptr) { + return; + } + + int remainingRefs = ref_ptr->fetch_sub(1, std::memory_order_acq_rel) - 1; + + if (remainingRefs == 0) { + shmem.destroy("ChannelData"); + shmem.destroy>("refCount"); + shmem = bip::managed_shared_memory(); + bip::shared_memory_object::remove(sharedMemoryName.c_str()); + } + } + + // SENDER SIDE: Send request, wait for response + Response send_request(const Request& req, std::stop_token stoken = {}) + requires Sender + { + // Write request + int write_slot = channel->request_write_idx.load(std::memory_order_acquire) % 2; + channel->requests[write_slot].data = req; + channel->requests[write_slot].valid.store(true, std::memory_order_release); + channel->request_write_idx.fetch_add(1, std::memory_order_release); + + // Wait for response in corresponding slot + int read_slot = channel->response_read_idx.load(std::memory_order_acquire) % 2; + int spin_count = 0; + while (!channel->responses[read_slot].valid.load(std::memory_order_acquire) && !stoken.stop_requested()) { + if (spin_count++ >= MAX_SPIN_WAIT) { + std::this_thread::yield(); + spin_count = 0; + } else { +#if defined(__x86_64__) || defined(_M_X64) + __builtin_ia32_pause(); +#elif defined(__aarch64__) + asm volatile("yield" ::: "memory"); +#endif + } + } + + if (stoken.stop_requested()) { + return Response{}; // Return default-constructed response on cancellation + } + + Response resp = channel->responses[read_slot].data; + channel->responses[read_slot].valid.store(false, std::memory_order_release); + channel->response_read_idx.fetch_add(1, std::memory_order_release); + + return resp; + } + + // RECEIVER SIDE: Wait for request, send response + Request receive_request(std::stop_token stoken = {}) + requires(!Sender) + { + int read_slot = channel->request_read_idx.load(std::memory_order_acquire) % 2; + int spin_count = 0; + + while (!channel->requests[read_slot].valid.load(std::memory_order_acquire) && !stoken.stop_requested()) { + if (spin_count++ >= MAX_SPIN_WAIT) { + std::this_thread::yield(); + spin_count = 0; + } else { +#if defined(__x86_64__) || defined(_M_X64) + __builtin_ia32_pause(); +#elif defined(__aarch64__) + asm volatile("yield" ::: "memory"); +#endif + } + } + + if (stoken.stop_requested()) { + return Request{}; // Return default-constructed request on cancellation + } + + Request req = channel->requests[read_slot].data; + channel->requests[read_slot].valid.store(false, std::memory_order_release); + channel->request_read_idx.fetch_add(1, std::memory_order_release); + + return req; + } + + void send_response(const Response& resp) + requires(!Sender) + { + int write_slot = channel->response_write_idx.load(std::memory_order_acquire) % 2; + channel->responses[write_slot].data = resp; + channel->responses[write_slot].valid.store(true, std::memory_order_release); + channel->response_write_idx.fetch_add(1, std::memory_order_release); + } +}; + +#endif /* INTERPROCESSCOMMUNICATIONCHANNEL */ diff --git a/finn_xsi/finn_xsi/include/InterprocessCommunicationChannelInterface.hpp b/finn_xsi/finn_xsi/include/InterprocessCommunicationChannelInterface.hpp new file mode 100644 index 0000000000..1ae409c0af --- /dev/null +++ b/finn_xsi/finn_xsi/include/InterprocessCommunicationChannelInterface.hpp @@ -0,0 +1,72 @@ +#ifndef INTERPROCESSCOMMUNICATIONCHANNELINTERFACE +#define INTERPROCESSCOMMUNICATIONCHANNELINTERFACE + +#include +#include +#include + +template +class InterprocessCommunicationChannelInterface : public CommunicationChannel { + struct Forward { + bool valid; + }; + + struct Backward { + bool ready; + }; + + InterprocessCommunicationChannel channel; + Backward lastResponse; + + public: + // Default constructor + InterprocessCommunicationChannelInterface() = default; + + // Constructor with shared memory name + explicit InterprocessCommunicationChannelInterface(const std::string& shmName) : channel(shmName), lastResponse{false} {} + + // Delete copy operations + InterprocessCommunicationChannelInterface(const InterprocessCommunicationChannelInterface&) = delete; + InterprocessCommunicationChannelInterface& operator=(const InterprocessCommunicationChannelInterface&) = delete; + + // Move constructor + InterprocessCommunicationChannelInterface(InterprocessCommunicationChannelInterface&& other) noexcept = default; + + // Move assignment operator + InterprocessCommunicationChannelInterface& operator=(InterprocessCommunicationChannelInterface&& other) noexcept = default; + + virtual bool getInputReady([[maybe_unused]] std::stop_token stoken = {}) override { + if constexpr (!IsSender) { + throw std::runtime_error("getInputReady can only be called on sender instances."); + } else { + return lastResponse.ready; + } + + } + virtual bool getOutputValid(std::stop_token stoken = {}) override { + if constexpr (IsSender) { + throw std::runtime_error("getOutputValid can only be called on receiver instances."); + } else { + return channel.receive_request(stoken).valid; + } + } + + virtual void setInputValid(bool incomingValid, std::stop_token stoken = {}) override { + if constexpr (!IsSender) { + throw std::runtime_error("setInputValid can only be called on sender instances."); + } else { + lastResponse = channel.send_request(Forward{incomingValid}, stoken); + } + } + virtual void setOutputReady(bool incomingReady, [[maybe_unused]] std::stop_token stoken = {}) override { + if constexpr (IsSender) { + throw std::runtime_error("setOutputReady can only be called on receiver instances."); + } else { + channel.send_response(Backward{incomingReady}); + } + } + + virtual ~InterprocessCommunicationChannelInterface() = default; +}; + +#endif /* INTERPROCESSCOMMUNICATIONCHANNELINTERFACE */ diff --git a/finn_xsi/finn_xsi/include/IsolatedSimulation.hpp b/finn_xsi/finn_xsi/include/IsolatedSimulation.hpp new file mode 100644 index 0000000000..8b9636efc0 --- /dev/null +++ b/finn_xsi/finn_xsi/include/IsolatedSimulation.hpp @@ -0,0 +1,208 @@ +#include +#include "SocketServer.h" + + +template +class IsolatedSimulation : public Simulation { + enum class LogType {READY, VALID}; + std::string readylogName; + std::string validlogName; + nlohmann::ordered_json readyJson; + nlohmann::ordered_json validJson; + std::vector inJobSizes; + std::vector outJobSizes; + + /** + * For the given streams check which has the largest job size, and return a tuple + * (stream_index, job_size) for that stream. + **/ + std::tuple getLargestTxnsStream(std::vector& jobSizes) { + size_t l = 0; + size_t idx = 0; + for (size_t i = 0; i < jobSizes.size(); i++) { + if (jobSizes[i] > l) { + l = jobSizes[i]; + idx = i; + } + } + return std::make_tuple(idx, l); + } + + class SimState { + public: + bool running; + size_t inputCyclesDone; + size_t inputCyclesTarget; + size_t inputLargestStreamIndex; + size_t outputCyclesDone; + size_t outputCyclesTarget; + size_t outputLargestStreamIndex; + size_t totalCycles; + + SimState(IsolatedSimulation& sim) { + reset(sim); + } + void reset(IsolatedSimulation& sim) { + totalCycles = 0; + inputCyclesDone = 0; + outputCyclesDone = 0; + running = false; + auto largestIn = sim.getLargestTxnsStream(sim.inJobSizes); + auto largestOut = sim.getLargestTxnsStream(sim.outJobSizes); + inputCyclesTarget = std::get<1>(largestIn) * 2; + inputLargestStreamIndex = std::get<0>(largestIn); + outputCyclesTarget = std::get<1>(largestOut) * 2; + outputLargestStreamIndex = std::get<0>(largestOut); + } + inline bool inputCyclesProcessed() { return inputCyclesDone >= inputCyclesTarget; } + inline bool outputCyclesProcessed() { return outputCyclesDone >= outputCyclesTarget; } + inline bool allCyclesProcessed() { return inputCyclesProcessed() && outputCyclesProcessed(); } + inline bool isRunning() { return running; } + void setRunning(bool v) { running = v; } + std::string getCycleStateInput() { return std::to_string(totalCycles) + "," + std::to_string(inputCyclesDone) + "," + std::to_string(inputCyclesTarget); } + std::string getCycleStateOutput() { return std::to_string(totalCycles) + "," + std::to_string(outputCyclesDone) + "," + std::to_string(outputCyclesTarget); } + json getStatus() { + json j; + if (!running && allCyclesProcessed()) { + j["state"] = "done"; + } else { + j["state"] = running ? "running" : "halted"; + } + j["totalCycles"] = totalCycles; + j["inputCyclesDone"] = inputCyclesDone; + j["inputCyclesTarget"] = inputCyclesTarget; + j["outputCyclesDone"] = outputCyclesDone; + j["outputCyclesTarget"] = outputCyclesTarget; + return j; + } + }; + + + /** Log the ready and valid signals to the JSON fields **/ + void logReady() { + nlohmann::ordered_json j; + j["totalCycles"] = simState.totalCycles; + j["inputCyclesDone"] = simState.inputCyclesDone; + j["inputCyclesTarget"] = simState.inputCyclesTarget; + for (S_AXIS_Control& s : this->istreams) { + j[s.name] = s.getInputReady(); + } + readyJson.push_back(j); + } + + void logValid() { + nlohmann::ordered_json j; + j["totalCycles"] = simState.totalCycles; + j["outputCyclesDone"] = simState.outputCyclesDone; + j["outputCyclesTarget"] = simState.outputCyclesTarget; + for (M_AXIS_Control& s : this->ostreams) { + j[s.name] = s.getOutputValid(); + } + validJson.push_back(j); + } + + SimState simState; + + public: + IsolatedSimulation( + const std::string& kernel_lib, + const std::string& design_lib, + const char* xsim_log_file, + const char* trace_file, + std::array _istream_descs, + std::array _ostream_descs + ) : Simulation( + kernel_lib, design_lib, xsim_log_file, trace_file, _istream_descs, _ostream_descs + ), simState(*this), readyJson(json::array()), validJson(json::array()), + readylogName("readylog.txt"), validlogName("validlog.txt") { + // TODO: Clearly split names between connected and isolated sim (ready_log.txt and readylog.txt) + inJobSizes.resize(_istream_descs.size()); + outJobSizes.resize(_ostream_descs.size()); + std::transform( + _istream_descs.begin(), + _istream_descs.end(), + inJobSizes.begin(), + [](StreamDescriptor& s) { return s.job_size; } + ); + std::transform( + _ostream_descs.begin(), + _ostream_descs.end(), + outJobSizes.begin(), + [](StreamDescriptor& s) { return s.job_size; } + ); + } + + /** Write logs to disk **/ + void commitLogsToDisk(bool clearLogs = true) { + std::ofstream r(readylogName, std::ios::trunc); + std::ofstream v(validlogName, std::ios::trunc); + r << std::setw(4) << readyJson; + std::cout << "Writing ready log: " << readyJson.size() << " elements." << std::endl; + v << std::setw(4) << validJson; + std::cout << "Writing valid log: " << validJson.size() << " elements." << std::endl; + r.close(); + v.close(); + if (clearLogs) { + readyJson = json::array(); + validJson = json::array(); + } + } + + json getStatus() { + return simState.getStatus(); + } + + void halt() { + simState.setRunning(false); + } + + void resume() { + simState.setRunning(true); + } + + bool isRunning() { return simState.isRunning(); } + + bool isDone() { + return !simState.isRunning() && simState.allCyclesProcessed(); + } + + /*** + * Simulate a single cycle + ***/ + void simulate(bool restart = false) { + if (restart) { + simState.reset(*this); + simState.setRunning(true); + std::cout << "Sim set to running: " << simState.isRunning() << std::endl; + std::cout << "Target input/output cycles: " << simState.inputCyclesTarget << ", " << simState.outputCyclesTarget << std::endl; + this->clearPorts(); + this->reset(); + for (S_AXIS_Control& s : this->istreams) { + s.setInputValid(true); + } + for (M_AXIS_Control& s : this->ostreams) { + s.setOutputReady(true); + } + } + + if (!simState.isRunning()) { + std::cout << "Simulation not running! Send \"start\" command first." << std::endl; + return; + } + if (!simState.allCyclesProcessed()) { + logValid(); + logReady(); + + if (!simState.inputCyclesProcessed() && this->istreams[simState.inputLargestStreamIndex].getInputReady()) { + ++simState.inputCyclesDone; + } + if (!simState.outputCyclesProcessed() && this->ostreams[simState.outputLargestStreamIndex].getOutputValid()) { + ++simState.outputCyclesDone; + } + this->clk.toggleClk(); + ++simState.totalCycles; + } else { + simState.setRunning(false); + } + } +}; diff --git a/finn_xsi/finn_xsi/include/Kernel.h b/finn_xsi/finn_xsi/include/Kernel.h new file mode 100644 index 0000000000..c7713ea00d --- /dev/null +++ b/finn_xsi/finn_xsi/include/Kernel.h @@ -0,0 +1,133 @@ +#ifndef KERNEL_H_ +#define KERNEL_H_ + +#include + +#include +#include +#include +#include + +#include "xsi.h" + +namespace xsi { + + // Forward declarations + class Design; + class Port; + + class Kernel { + //----------------------------------------------------------------------- + // Dispatch Table for XSI Functions + class Xsi { + //- Statics --------------------- + public: + // Function Indeces + static constexpr unsigned get_value = 0, put_value = 1, get_int_port = 2, get_str_port = 3, + + get_int = 4, get_port_number = 5, + + trace_all = 6, run = 7, restart = 8, get_status = 9, get_error_info = 10, + + close = 11; + + private: + // Function Names & Types + static constexpr unsigned EXTENT = 12; + static char const* const FUNC_NAMES[EXTENT]; + using type_map = std::tuple< + // Port Access + t_fp_xsi_get_value, t_fp_xsi_put_value, t_fp_xsi_get_int_port, t_fp_xsi_get_str_port, + + // Design Inspection + t_fp_xsi_get_int, t_fp_xsi_get_port_number, + + // Simulation Control & Status + t_fp_xsi_trace_all, t_fp_xsi_run, t_fp_xsi_restart, t_fp_xsi_get_status, t_fp_xsi_get_error_info, + + // Closing + t_fp_xsi_close>; + + //- Actual Contents ------------- + private: + xsiHandle _hdl; + void* _func[EXTENT]; + + //- Lifecycle: in-place structure inside Kernel only + public: + Xsi(xsi::SharedLibrary& lib); + ~Xsi() {} + + private: + Xsi(Xsi const&) = delete; + Xsi& operator=(Xsi const&) = delete; + + public: + // Move constructor + Xsi(Xsi&& other) noexcept; // Move assignment operator + Xsi& operator=(Xsi&& other) noexcept; + + //- Handle Update --------------- + public: + void setHandle(xsiHandle hdl) noexcept; + bool hasValidHandle() const noexcept; + + //- XSI Function Invocation ----- + public: + template + auto invoke(Args&&... args) const { + auto const f = decltype(std::get(type_map()))(_func[FID]); + return (*f)(_hdl, std::forward(args)...); + } + + }; // class Xsi + + private: + // Instance State + xsi::SharedLibrary _kernel_lib; // Backing Kernel Library + Xsi _xsi; // XSI Dispatch Table + + // Optional State once a Design in open + xsi::SharedLibrary _design_lib; + std::vector _ports; + + public: + Kernel(const std::string& kernel_lib); + Kernel(Kernel const&) = delete; + Kernel& operator=(Kernel const&) = delete; + + // Move constructor + Kernel(Kernel&& other) noexcept; + // Move assignment operator + Kernel& operator=(Kernel&& other) noexcept; + + ~Kernel(); + + // Interface reserved for forwarded access through open Design + private: + friend Design; + friend Port; + template + auto xsi(Args&&... args) const { + return _xsi.invoke(std::forward(args)...); + } + + // Port Accessors inlined below and public through Design + Port& getPort(const char* const name); + const Port& getPort(const char* const name) const; + std::span ports() noexcept; + std::span ports() const noexcept; + + // Design con- & destruction hooks + void open(const std::string& design_lib, const s_xsi_setup_info& setup_info); + void close() noexcept; + + public: + // Port count accessor for Design class + size_t port_count() const noexcept; + + }; // class Kernel + +} // namespace xsi + +#endif /* KERNEL_H_ */ diff --git a/finn_xsi/finn_xsi/include/Port.h b/finn_xsi/finn_xsi/include/Port.h new file mode 100644 index 0000000000..0b75b0ecfa --- /dev/null +++ b/finn_xsi/finn_xsi/include/Port.h @@ -0,0 +1,64 @@ +#ifndef PORT_H_ +#define PORT_H_ + +#include +#include + +#include "xsi.h" + +namespace xsi { + + class Kernel; // Forward declaration + + // Only exists within controlled environment within Kernel with open Design. + class Port { + Kernel& _kernel; + unsigned const _id; + std::vector buffer; + + private: + friend Kernel; + // Con- and destruction under full control of Kernel + Port(Port const&) = delete; + Port& operator=(Port const&) = delete; + Port(Kernel& kernel, const unsigned id); + + public: + Port(Port&& other) noexcept; + ~Port() noexcept; + + public: + const char* name() const noexcept; + int dir() const noexcept; + unsigned width() const noexcept; + + bool isInput() const noexcept; + bool isOutput() const noexcept; + bool isInout() const noexcept; + + public: + // Buffer Synchronization + Port& read(); + void write_back(); + + // Inspection + bool hasUnknown() const noexcept; + bool isZero() const noexcept; + bool operator[](const unsigned idx) const noexcept; + + bool as_bool() const noexcept; + unsigned as_unsigned() const noexcept; + std::string as_binstr() const; + std::string as_hexstr() const; + + // Manipulation + Port& clear(); + Port& set(const unsigned val); + Port& set_binstr(const std::string& val); + Port& set_hexstr(const std::string& val); + + }; // class Port + +} // namespace xsi + +#endif /* PORT_H_ */ diff --git a/finn_xsi/finn_xsi/include/SharedLibrary.h b/finn_xsi/finn_xsi/include/SharedLibrary.h new file mode 100644 index 0000000000..0f5e768f6c --- /dev/null +++ b/finn_xsi/finn_xsi/include/SharedLibrary.h @@ -0,0 +1,69 @@ +#ifndef SHAREDLIBRARY_H_ +#define SHAREDLIBRARY_H_ + +#include +#include +#include + +#if defined(_WIN32) + #include +#else + #include +#endif + +namespace xsi { + class SharedLibrary { + public: + static char const library_suffix[]; + + private: + using handle_type = +#if defined(_WIN32) + HINSTANCE; +#else + void*; +#endif + + //----------------------------------------------------------------------- + // Instance State + private: + handle_type _lib; + std::string _path; + + //----------------------------------------------------------------------- + // Life Cycle + public: + SharedLibrary(); + SharedLibrary(const std::string& path); + ~SharedLibrary(); + + private: + SharedLibrary(SharedLibrary const&) = delete; + SharedLibrary& operator=(SharedLibrary const&) = delete; + + public: + // Move constructor + SharedLibrary(SharedLibrary&& other) noexcept; + + // Move assignment operator + SharedLibrary& operator=(SharedLibrary&& other) noexcept; + + public: + operator bool() const noexcept; + SharedLibrary& open(const std::string& path); + SharedLibrary& close() noexcept; + + private: + static handle_type load(const std::string& path); + void unload() noexcept; + + //----------------------------------------------------------------------- + // Accessors + public: + const std::string& path() const noexcept; + std::optional getsymbol(const char* const name); + + }; // class SharedLibrary +} // namespace xsi + +#endif /* SHAREDLIBRARY_H_ */ diff --git a/finn_xsi/finn_xsi/include/Simulation.hpp b/finn_xsi/finn_xsi/include/Simulation.hpp new file mode 100644 index 0000000000..64bcc304ea --- /dev/null +++ b/finn_xsi/finn_xsi/include/Simulation.hpp @@ -0,0 +1,392 @@ +#ifndef SIMULATION +#define SIMULATION +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +template +class Simulation { + protected: + std::ofstream readyLog; + std::ofstream validLog; + + public: + xsi::Kernel kernel; + xsi::Design top; + // S_AXIS_Control goes into the simulated layer + std::array istreams; + // M_AXIS_Control comes from the simulated layer + std::array ostreams; + Clock clk; + + + Simulation(const std::string& kernel_lib, const std::string& design_lib, const char* xsim_log_file, const char* trace_file, + std::array _istream_descs, std::array _ostream_descs) + : kernel(kernel_lib), top(kernel, design_lib, xsim_log_file, trace_file), clk(top) { + if (trace_file) { + top.trace_all(); + } + + // Find I/O Streams and initialize their Status + for (size_t i = 0; i < _istream_descs.size(); ++i) { + istreams[i] = S_AXIS_Control{top, clk, std::data(_istream_descs)[i].job_size, std::data(_istream_descs)[i].job_size, std::data(_istream_descs)[i].name}; + } + for (size_t i = 0; i < _ostream_descs.size(); ++i) { + ostreams[i] = M_AXIS_Control{top, clk, std::data(_ostream_descs)[i].job_size, std::data(_ostream_descs)[i].name}; + } + + // Save simulation input output behaviour + if constexpr (LoggingEnabled) { + readyLog.open("ready_log.txt"); + validLog.open("valid_log.txt"); + } + + // Find Global Control & Run Startup Sequence + clearPorts(); + reset(); + } + + template + bool hasValidOutput() { + // static_assert(Index < ostreams.size(), "Cannot request valid status of unknown output stream index"); + return ostreams[Index].is_valid(); + } + + void clearPorts() noexcept { + // Clear all input ports + for (xsi::Port& p : top.ports()) { + if (p.isInput()) { + p.clear().write_back(); + } + } + } + + void reset() noexcept { + xsi::Port& rst_n = top.getPort("ap_rst_n"); + // Reset all Inputs, Wait for Reset Period + rst_n.set(0).write_back(); + for (unsigned i = 0; i < 16; i++) { + clk.toggleClk(); + } + rst_n.set(1).write_back(); + } +}; + +// Small struct used for exange. Will be changed later to more complex data structure. +struct CommData { + bool data; +}; + +// Communication Flow: +// +// valid ┌──────────────────────────────────────┐ valid valid +// SHM ─────────> │ valid valid │ ─────────> FIFO ─────> SHM +// (pred) <───────── istream ─────────> xsim ─────────> ostream <───────── <───── (succ) +// ready │ <───────── <───────── │ ready ready +// │ ready ready │ +// │ (sim) │ +// └──────────────────────────────────────┘ +template +class SingleNodeSimulation : public Simulation { + using ConsumingInterface = InterprocessCommunicationChannel; + using ProducingInterface = InterprocessCommunicationChannel; + std::array fromProducerInterface; + std::array toConsumerInterface; + std::size_t cyclesRun = 0; + std::size_t completedMaps = 0; + std::array fifo; + + /** + * Initialize streams according to nodeindex + */ + void initStreams() { + if constexpr (FirstNode) { // First Node; no predecessor + for (auto&& s : this->istreams) { // Input into sim valid + s.setInputValid(true); + } + } else if constexpr (LastNode) { // Last Node; no successor + for (auto&& s : this->ostreams) { // Output from sim ready + s.setOutputReady(true); + } + } + } + + [[gnu::hot, gnu::always_inline]] bool runSingleCycle(std::stop_token stoken = {}) { + ++cyclesRun; + bool ret = false; + if constexpr (!FirstNode) { + for (std::size_t i = 0; i < IStreamsSize; ++i) { + // Interface SHM <-> sim + bool istreamReady = this->istreams[i].getInputReady(); + bool fifoValid = fromProducerInterface[i].send_request(CommData{istreamReady}, stoken).data; + this->istreams[i].setValid(fifoValid); // deferred + } + } + if constexpr (!LastNode) { + for (std::size_t i = 0; i < OStreamsSize; ++i) { + // Interface sim -valid-> FIFO + this->fifo[i].setInputValid(this->ostreams[i].getOutputValid(), stoken); + // Interface FIFO <-> SHM + this->fifo[i].setOutputReady(toConsumerInterface[i].receive_request(stoken).data, stoken); + + // Toggle FIFO clock + ret |= this->fifo[i].toggleClock(); + bool fifoValid = this->fifo[i].getOutputValid(); + toConsumerInterface[i].send_response(CommData{fifoValid}); + // FIFO -ready-> sim + this->ostreams[i].setReady(this->fifo[i].getInputReady()); + } + } + if constexpr (LastNode) { + for (auto&& stream : this->ostreams) { + if (stream.getOutputValid() && ++stream.job_txns == stream.job_size) { + // Track job completion and intervals + std::size_t lastComplete = stream.lastComplete; + stream.interval = cyclesRun - lastComplete; + stream.lastComplete = cyclesRun; + stream.job_txns = 0; + ++completedMaps; + if (lastComplete != 0) { + // Update stable state tracker + stream.stableState.update(stream.interval); + } + } + } + } + // ── CLOCK HIGH ───────────────────────────────────────────────────────── + this->clk.clockHigh(); // run(1) [gap] → clk=1 → run(1) + + // ── WRITE (clock is high, commit deferred setValid / setReady) ───────── + // + // The deferred values were prepared at the end of the previous cycle's read + // phase (or are defaults for the first cycle). + for (std::size_t i = 0; i < IStreamsSize; ++i) { + this->istreams[i].writeBack(); + } + for (std::size_t i = 0; i < OStreamsSize; ++i) { + this->ostreams[i].writeBack(); + } + + // ── CLOCK LOW ────────────────────────────────────────────────────────── + this->clk.clockLow(); // run(4999) → clk=0 → run(4999) ← sim settles + return ret; + } + + public: + SingleNodeSimulation(const std::string& kernel_lib, const std::string& design_lib, const char* xsim_log_file, const char* trace_file, + std::array _istream_descs, std::array _ostream_descs, + std::array inputInterfaceNames, std::array outputInterfaceNames, + unsigned int initialFIFODepth = 2) + : Simulation(kernel_lib, design_lib, xsim_log_file, trace_file, _istream_descs, _ostream_descs) { + if (!FirstNode && inputInterfaceNames.empty()) { + throw std::runtime_error("Cannot communicate with predecessor because previous node name was not given!"); + } + if (!LastNode && outputInterfaceNames.empty()) { + throw std::runtime_error( + "Cannot communicate with successor because " + "current node name was not given!"); + } + + if constexpr (!LastNode) { + // Create FIFO buffer + for (std::size_t i = 0; i < OStreamsSize; ++i) { + fifo[i] = FIFO(initialFIFODepth); + } + } + + std::cout << "Initialized " << OStreamsSize << " output FIFOs with depth " << initialFIFODepth << std::endl; + + if constexpr (!LastNode) { + // Create consumer facing interfaces + for (std::size_t i = 0; i < OStreamsSize; ++i) { + std::string shmName{outputInterfaceNames[i]}; + toConsumerInterface[i] = std::move(ProducingInterface(shmName)); + } + } + + std::cout << "Initialized " << OStreamsSize << " producing interfaces for successor communication" << std::endl; + + if constexpr (!FirstNode) { + for (std::size_t i = 0; i < IStreamsSize; ++i) { + std::string shmName{inputInterfaceNames[i]}; + fromProducerInterface[i] = std::move(ConsumingInterface(shmName)); + } + } + + std::cout << "Initialized " << IStreamsSize << " consuming interfaces for predecessor communication" << std::endl; + + // Verify communication works + if constexpr (!LastNode) { + for (std::size_t i = 0; i < OStreamsSize; ++i) { + toConsumerInterface[i].handshake(); + } + } + if constexpr (!FirstNode) { + for (std::size_t i = 0; i < IStreamsSize; ++i) { + fromProducerInterface[i].handshake(); + } + } + + this->clk.clockHigh(); + initStreams(); + this->clk.clockLow(); + std::cout << "Finished initializing simulation." << std::endl; + } + + /// Reset simulation (stream and current FIFO depth, as well as cycle counter) + void reset() { + Simulation::reset(); + if constexpr (!LastNode) { + // Reset FIFOs + for (std::size_t i = 0; i < OStreamsSize; ++i) { + fifo[i].reset(); + } + } + } + + [[gnu::hot, gnu::always_inline]] void runFeatureMaps(std::size_t featureMaps, std::stop_token stoken = {}) { + completedMaps = 0; + while (completedMaps < featureMaps && !stoken.stop_requested()) { + runSingleCycle(stoken); + } + } + + [[gnu::hot, gnu::always_inline]] bool runToStableState(std::stop_token stoken = {}, std::size_t max_cycles = std::numeric_limits::max()) { + bool timeout = false; + while (!std::all_of(this->ostreams.begin(), this->ostreams.end(), [](const M_AXIS_Control& stream) { return stream.stableState.is_stable(); }) & !stoken.stop_requested() & + (cyclesRun <= max_cycles) & !timeout) { + timeout |= runSingleCycle(stoken); + timeout |= runSingleCycle(stoken); + timeout |= runSingleCycle(stoken); + timeout |= runSingleCycle(stoken); + } + return timeout || cyclesRun > max_cycles; + } + + /// Get the number of FIFOs + std::size_t getFIFOCount() const noexcept { + if constexpr (LastNode) { + return 0; + } + return OStreamsSize; + } + + /// Set the depth of a specific FIFO + void setFIFODepth(std::size_t index, std::size_t depth) { + if constexpr (LastNode) { + throw std::runtime_error("Cannot set FIFO depth on last node (no FIFOs present)"); + } + if (index >= OStreamsSize) { + auto error = "FIFO index " + std::to_string(index) + " out of range (max: " + std::to_string(OStreamsSize - 1) + ")"; + throw std::out_of_range(error); + } + fifo[index].setMaxSize(depth); + } + + void setFIFOCyclesUntilExpectedFirstValid(std::size_t index, std::size_t cycles) { + if constexpr (LastNode) { + throw std::runtime_error("Cannot set FIFO cycles until expected first valid on last node (no FIFOs present)"); + } + if (index >= OStreamsSize) { + auto error = "FIFO index " + std::to_string(index) + " out of range (max: " + std::to_string(OStreamsSize - 1) + ")"; + throw std::out_of_range(error); + } + fifo[index].setCyclesUntilExpectedFirstValid(cycles); + } + + /// Set the max FIFO depth of all interfaces + void setMaxFIFODepth(std::size_t depth) { + if constexpr (!LastNode) { + for (FIFO& f : fifo) { + f.setMaxSize(depth); + } + } + } + + std::array getFIFODepth() const noexcept { + if constexpr (LastNode) { + return {}; + } + std::array utilizations{}; + for (std::size_t i = 0; i < OStreamsSize; ++i) { + utilizations[i] = fifo[i].getMaxSize(); + } + return utilizations; + } + + std::array getFIFOCyclesUntilFirstValid() const noexcept { + if constexpr (LastNode) { + return {}; + } + std::array cycles{}; + for (std::size_t i = 0; i < OStreamsSize; ++i) { + cycles[i] = fifo[i].getCyclesUntilFirstValid(); + } + return cycles; + } + + /// Get the job size of the specified output stream + std::size_t getOutputJobSize(std::size_t outputIndex = 0) { return this->ostreams[outputIndex].job_size; } + + /// Get the job size of the specified input stream + std::size_t getInputJobSize(std::size_t inputIndex = 0) { return this->istreams[inputIndex].job_size; } + + /// Get the number of cycles the simulation has run + std::size_t getCyclesRun() const noexcept { return cyclesRun; } + + /// Get the number of completed feature maps + std::size_t getCompletedMaps() const noexcept { return completedMaps; } + + /// Get the maximum FIFO utilization for each output stream + std::array getFIFOUtilization() const noexcept { + if constexpr (LastNode) { + return {}; + } + std::array utilizations{}; + for (std::size_t i = 0; i < OStreamsSize; ++i) { + utilizations[i] = fifo[i].getMaxUtil(); + } + return utilizations; + } + + /// Get the current Ostream stable state intervals. + /// Returns the rounded EMA of observed output intervals so that a single noisy + /// measurement at the boundary of stability does not cause _check_performance to + /// report a false positive or negative (raw last interval can differ from the EMA + /// by up to the StableStateTracker stability threshold in either direction). + /// This should not be the case, but its an additional security measure. + std::array getOStreamStableStateIntervals() const noexcept { + std::array intervals{}; + if constexpr (LastNode) { + for (std::size_t i = 0; i < OStreamsSize; ++i) { + const double ema = this->ostreams[i].stableState.get_ema(); + // Fall back to the raw interval when the EMA has never been updated + // (ema == 0.0 means no second job completion has occurred yet). + intervals[i] = (ema > 0.0) ? static_cast(std::round(ema)) : this->ostreams[i].interval; + } + } + return intervals; + } +}; + + +#endif /* SIMULATION */ diff --git a/finn_xsi/finn_xsi/include/SocketServer.h b/finn_xsi/finn_xsi/include/SocketServer.h new file mode 100644 index 0000000000..9d7e597b9c --- /dev/null +++ b/finn_xsi/finn_xsi/include/SocketServer.h @@ -0,0 +1,40 @@ +#ifndef SOCKET_SERVER_H +#define SOCKET_SERVER_H + +#include +#include +#include +#include + +using json = nlohmann::ordered_json; + +class SocketServer { + private: + int server_fd{-1}; + int client_fd{-1}; + std::string socket_path; + + void close_fd(int& fd) noexcept; + + public: + explicit SocketServer(std::string_view path); + ~SocketServer(); + + // Disable copy construction and assignment + SocketServer(const SocketServer&) = delete; + SocketServer& operator=(const SocketServer&) = delete; + + // Enable move semantics + SocketServer(SocketServer&& other) noexcept; + SocketServer& operator=(SocketServer&& other) noexcept; + + // Returns std::nullopt on success, error message on failure + [[nodiscard]] std::optional initialize(); + [[nodiscard]] std::optional receive_message(); + void send_message(const json& message); + void close_connection() noexcept; + + [[nodiscard]] bool is_connected() const noexcept { return client_fd >= 0; } +}; + +#endif // SOCKET_SERVER_H diff --git a/finn_xsi/finn_xsi/include/StableStateTracker.hpp b/finn_xsi/finn_xsi/include/StableStateTracker.hpp new file mode 100644 index 0000000000..e7da06726d --- /dev/null +++ b/finn_xsi/finn_xsi/include/StableStateTracker.hpp @@ -0,0 +1,84 @@ +#ifndef STABLESTATETRACKER +#define STABLESTATETRACKER + +#include +#include + +/** + * Implements an Exponential Moving Average (EMA) tracker with stability detection. + * The tracker updates its EMA with new unsigned integral values and checks for stability + * based on relative changes over consecutive updates. + */ +template + requires (Alpha > 0 && Alpha <= 1) && + (StabilityThreshold > 0 && StabilityThreshold < 1) && + (RequiredStableCount > 0) +class StableStateTracker { +private: + static constexpr double InvAlpha = 1.0 - Alpha; + static constexpr double SquaredStabilityThreshold = StabilityThreshold * StabilityThreshold; + + double ema; + uint8_t stableCount; + +public: + constexpr StableStateTracker() noexcept + : ema{0.0} + , stableCount{0} + { + } + + /** + * Update with new interval value + * Concepts ensure only unsigned integral types are accepted + */ + inline void update(std::unsigned_integral auto value) noexcept { + // First update initializes directly + if (ema == 0.0) [[unlikely]] { + ema = static_cast(value); + stableCount = 0; + return; + } + + const double oldEma = ema; + const double valDouble = static_cast(value); + + // EMA calculation: ema = value + (1-alpha) * (oldEma - value) + ema = valDouble + InvAlpha * (oldEma - valDouble); + + // Stability check: |change|² / oldEma² < threshold² + // Avoids sqrt and abs operations + const double diff = ema - oldEma; + const double squaredRelativeChange = (diff * diff) / (oldEma * oldEma); + + // Branchless increment/reset using arithmetic + const bool is_change_small = squaredRelativeChange < SquaredStabilityThreshold; + stableCount = is_change_small * (stableCount + (stableCount < RequiredStableCount)); + } + + [[nodiscard]] constexpr double get_ema() const noexcept { + return ema; + } + + [[nodiscard]] constexpr bool is_stable() const noexcept { + return stableCount >= RequiredStableCount; + } + + [[nodiscard]] constexpr uint8_t get_stable_count() const noexcept { + return stableCount; + } + + constexpr void reset() noexcept { + ema = 0.0; + stableCount = 0; + } + + // Get compile-time parameters + static consteval double get_alpha() { return Alpha; } + static consteval double get_stability_threshold() { return StabilityThreshold; } + static consteval uint8_t get_required_stable_count() { return RequiredStableCount; } +}; + +#endif /* STABLESTATETRACKER */ diff --git a/finn_xsi/finn_xsi/include/helper.h b/finn_xsi/finn_xsi/include/helper.h new file mode 100644 index 0000000000..aa13c4612a --- /dev/null +++ b/finn_xsi/finn_xsi/include/helper.h @@ -0,0 +1,25 @@ +#ifndef HELPER_H_ +#define HELPER_H_ + +#include +#include +#include + +constexpr std::array XZ10 = {'0', '1', 'Z', 'X'}; +constexpr std::array HEX = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + +struct StreamDescriptor { + std::string name; + std::size_t job_size; + // // Next job can only start this many clock ticks after start of predecessor. + // std::size_t job_ticks; +}; + +#ifdef NDEBUG +[[maybe_unused]] inline void debug([[maybe_unused]] std::string_view s) {} +#else +inline void debug(std::string_view s) { std::cout << "log [DBG] " << s << "\n"; } +#endif + +#endif /* HELPER_H_ */ diff --git a/finn_xsi/finn_xsi/rtlsim_config.hpp.template b/finn_xsi/finn_xsi/rtlsim_config.hpp.template index 3e0b35cc87..6b442ade25 100644 --- a/finn_xsi/finn_xsi/rtlsim_config.hpp.template +++ b/finn_xsi/finn_xsi/rtlsim_config.hpp.template @@ -11,33 +11,50 @@ * prior to compilation. ***************************************************************************/ -struct stream_desc { - char const *name; - size_t job_size; - // Next job can only start this many clock ticks after start of predecessor. - size_t job_ticks; -}; +#include +#include +#include +#include +#include +#include -// sim kernel .so to use (depends on Vivado version) -static char const kernel_libname[] = "@SIMKERNEL_SO@"; +namespace RTLSimConfig { + // Log during simulation. Turned off by default. Might increase runtime if used. + constexpr bool LoggingEnabled = true; + constexpr bool IsInputNode = @IS_INPUT_NODE@; + constexpr bool IsOutputNode = @IS_OUTPUT_NODE@; -// design library .so to use (important to use this relative path here, -// due to how XSI looks for certain files) -static char const design_libname[] = "xsim.dir/@TOP_MODULE_NAME@/xsimk.so"; + /**** General RTLSIM Configuration Parameters ****/ + constexpr std::array inputInterfaceNames { @INPUT_INTERFACE_NAMES@ }; + constexpr std::array outputInterfaceNames { @OUTPUT_INTERFACE_NAMES@ }; -// AXI stream descriptors {stream_name, transactions_per_inference} -// input AXI stream descriptors -static std::initializer_list const istream_descs { @ISTREAM_DESC@ }; + // Which index node this simulation executes + // In a complete design simulation this is 0 + constexpr size_t NodeIndex = @NODE_INDEX@; -// output AXI stream descriptors -static std::initializer_list const ostream_descs { @OSTREAM_DESC@ }; + // Number of total nodes in the simulation (over all processes) + // In a complete design simulation this is 1 + constexpr size_t TotalNodes = @TOTAL_NODES@; -// number of inferences to perform -constexpr unsigned n_inferences = @N_INFERENCES@; + // sim kernel .so to use (depends on Vivado version) + static char const kernel_libname[] = "@SIMKERNEL_SO@"; -// max number of cycles to wait for output activity on any stream before timeout -constexpr unsigned max_iters = @TIMEOUT_CYCLES@; + // design library .so to use (important to use this relative path here, + // due to how XSI looks for certain files) + static char const design_libname[] = "xsim.dir/@TOP_MODULE_NAME@/xsimk.so"; -// filename for trace and debug, if enabled. This needs xelab -debug option too. -static char const *const trace_filename = @TRACE_FILE@; -static char const *const xsim_log_filename = @XSIM_LOG_FILE@; + // AXI stream descriptors {stream_name, transactions_per_inference} + // input AXI stream descriptors + constexpr std::array istream_descs { @ISTREAM_DESC@ }; + + // output AXI stream descriptors + constexpr std::array ostream_descs { @OSTREAM_DESC@ }; + + // max number of cycles to wait for output activity on any stream before timeout + constexpr unsigned max_iters = @TIMEOUT_CYCLES@; + + // filename for trace and debug, if enabled. This needs xelab -debug option too. + static const std::optional trace_filename = @TRACE_FILE@; + static const std::string xsim_log_filename = @XSIM_LOG_FILE@; + +} diff --git a/finn_xsi/finn_xsi/rtlsim_xsi.cpp b/finn_xsi/finn_xsi/rtlsim_xsi.cpp deleted file mode 100644 index d4fe79581d..0000000000 --- a/finn_xsi/finn_xsi/rtlsim_xsi.cpp +++ /dev/null @@ -1,273 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2025, Advanced Micro Devices, Inc. - * All rights reserved. - * - * SPDX-License-Identifier: BSD-3-Clause - * - * @brief Driver harness demo running a FINN IP core. - * @author Yaman Umuroğlu - * @author Thomas B. Preußer - ***************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xsi_finn.hpp" -#include "rtlsim_config.hpp" - -int main(int argc, char *argv[]) { - - // Load Kernel and Design - xsi::Kernel kernel(kernel_libname); - xsi::Design top(kernel, design_libname, xsim_log_filename, trace_filename); - using Port = xsi::Port; - if(trace_filename) { - // TODO make tracing more finer-grain if possible? - top.trace_all(); - } - - // Ultimate Simulation Summary - std::string synopsis; - - { // RTL Simulation - - // Simulation Report Statistics - size_t iters = 0; - size_t timeout = 0; - size_t itodo = istream_descs.size(); - size_t otodo = ostream_descs.size(); - size_t omute = ostream_descs.size(); - - // Find I/O Streams and initialize their Status - struct stream_status { - char const *name; - Port &port_vld; - Port &port_rdy; - - // Job Size and Transaction Statistics - size_t job_size; - size_t job_txns; // [0:job_size] - size_t total_txns; - size_t first_complete; // First completion timestamp - - union { - // Input Stream - struct { - size_t job_ticks; // throttle if job_size < job_ticks - size_t await_iter; // iteration allowing start of next job - }; - // Output Stream - struct { - size_t last_complete; - size_t interval; - }; - }; - - public: - stream_status( - char const *name, Port &port_vld, Port &port_rdy, - size_t job_size, size_t job_ticks - ) : name(name), port_vld(port_vld), port_rdy(port_rdy), job_size(job_size), - job_txns(0), total_txns(0), - first_complete(0), job_ticks(job_ticks), await_iter(job_ticks) {} - }; - std::vector istreams; - std::vector ostreams; - for(auto t : { std::tie(istream_descs, istreams), std::tie(ostream_descs, ostreams) }) { - for(stream_desc const &desc : std::get<0>(t)) { - std::string const name(desc.name); - Port *const vld = top.getPort(name + "_tvalid"); - Port *const rdy = top.getPort(name + "_tready"); - if(!vld || !rdy) { - std::cerr << "Unable to find controls for " << desc.name << std::endl; - return 1; - } - - std::get<1>(t).emplace_back(desc.name, *vld, *rdy, desc.job_size, desc.job_ticks); - } - } - - // Find Global Control & Run Startup Sequence - std::function cycle; - { - Port *const clk = top.getPort("ap_clk"); - Port *const clk2x = top.getPort("ap_clk2x"); - Port *const rst_n = top.getPort("ap_rst_n"); - if(!clk) { - std::cerr << "No clock found on the design." << std::endl; - return 1; - } - cycle = clk2x? - std::function([&top, clk, clk2x](bool const up) mutable { - clk->set(up).write_back(); - clk2x->set(1).write_back(); - top.run(5); - clk2x->set(0).write_back(); - top.run(5); - }) : - std::function([&top, clk](bool const up) mutable { - clk->set(up).write_back(); - top.run(5); - }); - - // Reset all Inputs, Wait for Reset Period - for(Port &p : top.ports()) { if(p.isInput()) p.clear().write_back(); }; - if(rst_n) { - for(unsigned i = 0; i < 16; i++) { cycle(0); cycle(1); } - rst_n->set(1).write_back(); - } - } - - // Start Stream Feed and Capture - std::cout << "Starting data feed with idle-output timeout of " << max_iters << " cycles ...\n" << std::endl; - - // Make all Inputs valid & all Outputs ready - for(auto &s : istreams) s.port_vld.set(1).write_back(); - for(auto &s : ostreams) s.port_rdy.set(1).write_back(); - - // Enter Simulation Loop and track Progress - auto const begin = std::chrono::steady_clock::now(); - std::vector> to_write; - while(true) { - - //------------------------------------------------------------------- - // Clock down - then read signal updates from design - cycle(0); - - // check for transactions on input streams - for(auto &s : istreams) { - bool const vld = s.port_vld[0]; - bool const rdy = s.port_rdy.read()[0]; - if(vld && !rdy) continue; - - // Track successgul Transactions - if(vld) { - s.job_txns++; - if(++s.total_txns == s.job_size * n_inferences) itodo--; - } - - // Proceed according to Throttling Rate - if((s.job_txns < s.job_size) || !(iters < s.await_iter)) { - if(s.total_txns < s.job_size * n_inferences) { - if(!vld) to_write.emplace_back(s.port_vld.set(1)); - if(s.job_txns == s.job_size) { - s.job_txns = 0; - s.await_iter = iters + s.job_ticks; - } - continue; - } - } - if(vld) to_write.emplace_back(s.port_vld.set(0)); - } - - { // check for transactions on the output streams - bool dead = true; - for(auto &s : ostreams) { - if(s.port_rdy[0] && s.port_vld.read()[0]) { - size_t const txns = ++s.total_txns; - if(txns == s.job_size) { - s.first_complete = iters; - omute--; - } - if(++s.job_txns == s.job_size) { - s.interval = iters - s.last_complete; - s.last_complete = iters; - s.job_txns = 0; - } - if(txns >= s.job_size * n_inferences) { - if(txns == s.job_size * n_inferences) otodo--; - else { - std::cerr << "Spurious output on " << s.name << std::endl; - to_write.emplace_back(s.port_rdy.set(0)); - } - } - dead = false; - } - } - timeout = dead? timeout + 1 : 0; - } - - //------------------------------------------------------------------- - // Clock up - then write signal updates back to design - cycle(1); - - // Write back Ports with registered updates - for(Port &p : to_write) p.write_back(); - to_write.clear(); - - // Show a progress message once in a while - if(++iters % 10000 == 0) { - std::cout - << '@' << iters << " ticks / " - << std::chrono::duration_cast(std::chrono::steady_clock::now() - begin).count() << "s:"; - for(auto const &s : istreams) { - std::cout << '\t' << s.name << '=' << ((100 * s.total_txns) / (n_inferences * s.job_size)) << '%'; - } - for(auto const &s : ostreams) { - std::cout << '\t' << s.name << '=' << ((100 * s.total_txns) / (n_inferences * s.job_size)) << '%'; - } - std::cout << "\tMute Outputs: " << omute << std::endl; - } - - // Check for exit - if((timeout > max_iters) || (!itodo && !otodo)) break; - } - - size_t total_in_txns = 0; - for(auto const &s : istreams) total_in_txns += s.total_txns; - - size_t total_out_txns = 0; - size_t firstout_latency = 0; - size_t max_interval = 0; - for(auto const &s : ostreams) { - total_out_txns += s.total_txns; - firstout_latency = std::max(firstout_latency, s.first_complete); - max_interval = std::max(max_interval, s.interval); - } - - std::ostringstream bld; - bld << - "N_IN_TXNS\t" << total_in_txns << "\n" - "N_OUT_TXNS\t" << total_out_txns << "\n" - "cycles\t" << iters << "\n" - "N\t" << n_inferences << "\n" - "latency_cycles\t" << firstout_latency << "\n" - "interval_cycles\t" << max_interval << "\n" - "TIMEOUT\t" << (timeout > max_iters? "1" : "0") << "\n" - "UNFINISHED_INS\t" << itodo << "\n" - "UNFINISHED_OUTS\t" << otodo << "\n" - "RUNTIME_S\t" << std::chrono::duration_cast(std::chrono::steady_clock::now() - begin).count(); - synopsis = bld.str(); - - } // done simulation - - // Dump Simulation Statistics to stdout and results.txt - std::cout << '\n' << synopsis << std::endl; - - { // Log error info to file - std::ofstream error_file("fifosim.err", std::ios::out | std::ios::trunc); - error_file << top.get_error_info(); - } - - { // Synopsis and `max_count` readings to results file - std::ofstream results_file("results.txt", std::ios::out | std::ios::trunc); - results_file << synopsis << std::endl; - for(Port &p : top.ports()) { - if(p.isOutput()) { - char const *const name = p.name(); - if(std::strncmp(name, "maxcount", 8) == 0) { - p.read(); - results_file << name << '\t' << p.as_unsigned() << std::endl; - } - } - } - } - - return 0; -} diff --git a/finn_xsi/finn_xsi/src/AXIS_Control.cpp b/finn_xsi/finn_xsi/src/AXIS_Control.cpp new file mode 100644 index 0000000000..9e91cbaa38 --- /dev/null +++ b/finn_xsi/finn_xsi/src/AXIS_Control.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +#include + +std::string sanitize_prefix(const std::string& prefix) { + if (prefix.empty()) { + throw std::invalid_argument("AXI prefix cannot be empty."); + } + std::string sanitized = prefix; + if (sanitized.back() != '_') { + sanitized += "_"; + } + return sanitized; +} + +std::string remove_trailing_underscore(const std::string& prefix) { + std::string clean = prefix; + if (clean.back() == '_') { + // If checks implicitly that pop_back() doesn't have undefined behav. + clean.pop_back(); + } + return clean; +} + +AXIS_Control::AXIS_Control(xsi::Design& des, Clock& clock, size_t job_sz, const std::string& prefix) + : job_size(job_sz), + job_txns(0), + total_txns(0), + first_complete(0), + name(remove_trailing_underscore(sanitize_prefix(prefix))), + design(&des), + clk(&clock), + port_vld(&des.getPort(sanitize_prefix(prefix) + "tvalid")), + port_rdy(&des.getPort(sanitize_prefix(prefix) + "tready")) {} + +void AXIS_Control::inititialized_or_throw() { + if (!design || !clk || !port_rdy || !port_vld) { + throw std::runtime_error("AXIS Control object not correctly initialized! Aborting!"); + } +} + +void AXIS_Control::setInputValid(bool value, [[maybe_unused]] std::stop_token stoken) { port_vld->set(static_cast(value)).write_back(); } + +bool AXIS_Control::getOutputValid([[maybe_unused]] std::stop_token stoken) noexcept { return port_vld->read().as_bool(); } + +void AXIS_Control::setOutputReady(bool value, [[maybe_unused]] std::stop_token stoken) { port_rdy->set(static_cast(value)).write_back(); } +bool AXIS_Control::getInputReady([[maybe_unused]] std::stop_token stoken) noexcept { return port_rdy->read().as_bool(); } + +// Deferred write functions +std::reference_wrapper AXIS_Control::setValid(bool value) { return std::ref(port_vld->set(value ? 1 : 0)); } + +std::reference_wrapper AXIS_Control::setReady(bool value) { return std::ref(port_rdy->set(value ? 1 : 0)); } + +S_AXIS_Control::S_AXIS_Control(xsi::Design& des, Clock& clock, size_t job_sz, size_t job_tks, const std::string& prefix) + : AXIS_Control(des, clock, job_sz, prefix), job_ticks(job_tks), await_iter(job_tks) { + if (job_sz < 1 || job_tks < 1) { + throw std::invalid_argument("Job size and ticks must be greater than 0."); + } +} + +void S_AXIS_Control::writeBack() { + this->port_vld->write_back(); +} + +M_AXIS_Control::M_AXIS_Control(xsi::Design& des, Clock& clock, size_t job_sz, const std::string& prefix) : AXIS_Control(des, clock, job_sz, prefix), lastComplete(0), interval(0) { + if (job_sz < 1) { + throw std::invalid_argument("Job size must be greater than 0."); + } +} + +void M_AXIS_Control::writeBack() { + this->port_rdy->write_back(); +} diff --git a/finn_xsi/finn_xsi/src/AXI_Control.cpp b/finn_xsi/finn_xsi/src/AXI_Control.cpp new file mode 100644 index 0000000000..fa3c8b6f35 --- /dev/null +++ b/finn_xsi/finn_xsi/src/AXI_Control.cpp @@ -0,0 +1,188 @@ +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace xsi; + +// Constructor +AXI_Control::AXI_Control(xsi::Design& des, Clock& clock, const std::string& axi_prefix) : prefix(axi_prefix), design(des), clk(clock) { + // Check if the prefix is valid + if (prefix.empty()) { + throw std::invalid_argument("AXI prefix cannot be empty."); + } + + // Ensure the prefix ends with an underscore + if (prefix.back() != '_') { + prefix += "_"; + } +} + +// Helper functions for multi-bit signal handling +void AXI_Control::writeAddr(const std::string& signal, uint32_t addr) { + // Convert addr to binary string + std::string addr_bin = std::bitset<32>(addr).to_string(); + + // Remove leading zeros to get the actual size used in the simulation + addr_bin.erase(0, addr_bin.find_first_not_of('0')); + + + // Get port size + Port& port = design.getPort(signal); + auto n_bits = port.width(); + + // Ensure the string is the right length + if (addr_bin.length() < n_bits) { + addr_bin = std::string(n_bits - addr_bin.length(), '0') + addr_bin; + } else if (addr_bin.length() > n_bits) { + addr_bin = addr_bin.substr(addr_bin.length() - n_bits); + } + + port.set_binstr(addr_bin).write_back(); +} + +void AXI_Control::writeData(const std::string& signal, uint32_t data) { + // Similar to write_addr + std::string data_bin = std::bitset<32>(data).to_string(); + + // Get port size + Port& port = design.getPort(signal); + auto n_bits = port.width(); + + if (data_bin.length() < n_bits) { + data_bin = std::string(n_bits - data_bin.length(), '0') + data_bin; + } else if (data_bin.length() > n_bits) { + data_bin = data_bin.substr(data_bin.length() - n_bits); + } + + port.set_binstr(data_bin).write_back(); +} + +void AXI_Control::writeStrb(const std::string& signal, uint32_t strb) { + // Similar to write_addr + std::string strb_bin = std::bitset<4>(strb).to_string(); + + // Get port size + Port& port = design.getPort(signal); + auto n_bits = port.width(); + + if (strb_bin.length() < n_bits) { + strb_bin = std::string(n_bits - strb_bin.length(), '0') + strb_bin; + } else if (strb_bin.length() > n_bits) { + strb_bin = strb_bin.substr(strb_bin.length() - n_bits); + } + + port.set_binstr(strb_bin).write_back(); +} + +uint32_t AXI_Control::read(const std::string& signal) { + Port& port = design.getPort(signal); + return port.read().as_unsigned(); +} + +void AXI_Control::setBool(const std::string& signal) { + Port& port = design.getPort(signal); + port.set(1).write_back(); +} + +void AXI_Control::clearBool(const std::string& signal) { + Port& port = design.getPort(signal); + port.set(0).write_back(); +} + +bool AXI_Control::chkBool(const std::string& signal) { + Port& port = design.getPort(signal); + return port.read().as_bool(); +} + +void AXI_Control::writeRegister(uint32_t addr, uint32_t data) { + // Assert BREADY to receive response + setBool(prefix + "bready"); + // Set address + writeAddr(prefix + "awaddr", addr); + // Set data and strobe (full 32-bit word) + writeData(prefix + "wdata", data); + writeStrb(prefix + "wstrb", 0xF); // All bytes enabled + + // Assert AWVALID + setBool(prefix + "awvalid"); + + // Assert WVALID + setBool(prefix + "wvalid"); + + // Wait for AWREADY + while (!chkBool(prefix + "awready")) { + clk.toggleClk(); + } + + // Wait for WREADY + while (!chkBool(prefix + "wready")) { + clk.toggleClk(); + } + + clk.toggleClk(); // Make sure that for at least one cycle the signals were set + + // Deassert AWVALID and WVALID + clearBool(prefix + "awvalid"); + clearBool(prefix + "wvalid"); + + + // Wait for BVALID + while (!chkBool(prefix + "bvalid")) { + clk.toggleClk(); + } + + // Check BRESP (optional, could add error handling) + uint32_t bresp = read(prefix + "bresp"); + if (bresp != 0) { + std::cerr << "AXI write error: BRESP = " << bresp << std::endl; + } + + // Deassert BREADY + clearBool(prefix + "bready"); + + clk.toggleClk(); +} + +uint32_t AXI_Control::readRegister(uint32_t addr) { + // Assert RREADY to receive data + setBool(prefix + "rready"); + // Set address + writeAddr(prefix + "araddr", addr); + + // Assert ARVALID + setBool(prefix + "arvalid"); + + // Wait for ARREADY + while (!chkBool(prefix + "arready")) { + clk.toggleClk(); + } + + // Wait for RVALID + while (!chkBool(prefix + "rvalid")) { + clk.toggleClk(); + } + + // Deassert ARVALID + clearBool(prefix + "arvalid"); + + // Read data + uint32_t data = read(prefix + "rdata"); + + // Check RRESP (optional, could add error handling) + uint32_t rresp = read(prefix + "rresp"); + if (rresp != 0) { + std::cerr << "AXI read error: RRESP = " << rresp << std::endl; + } + + // Deassert RREADY + clearBool(prefix + "rready"); + clk.toggleClk(); + + return data; +} diff --git a/finn_xsi/finn_xsi/src/Clock.cpp b/finn_xsi/finn_xsi/src/Clock.cpp new file mode 100644 index 0000000000..1b5e329e93 --- /dev/null +++ b/finn_xsi/finn_xsi/src/Clock.cpp @@ -0,0 +1,68 @@ +#include +#include +#include + +using namespace xsi; + +Clock::Clock(xsi::Design& des) : design(des) { + // Find Global Control & Run Startup Sequence + Port& clk = des.getPort("ap_clk"); + auto ports = des.ports(); + + Port* clk2x = nullptr; + for (auto&& p : ports) { + if (p.name() == std::string("ap_clk2x")) { + clk2x = &p; + break; + } + } + clkHigh = clk2x ? std::function([&des, &clk, clk2x]() mutable { + des.run(1); + clk.set(1).write_back(); + clk2x->set(1).write_back(); + des.run(1); + }) : std::function([&des, &clk]() mutable { + des.run(1); + clk.set(1).write_back(); + des.run(1); + }); + clkLow = clk2x ? std::function([&des, &clk, clk2x]() mutable { + des.run(2499); + clk2x->set(0).write_back(); + des.run(2500); + clk.set(0).write_back(); + clk2x->set(1).write_back(); + des.run(2500); + clk2x->set(0).write_back(); + des.run(2499); + + }) : std::function([&des, &clk]() mutable { + des.run(4999); + clk.set(0).write_back(); + des.run(4999); + }); + // cycle = clk2x ? std::function([&des, &clk, clk2x](bool const up) mutable { + // clk.set(up).write_back(); + // clk2x->set(1).write_back(); + // des.run(5000); + // clk2x->set(0).write_back(); + // des.run(5000); + // }) + // : std::function([&des, &clk](bool const up) mutable { + // clk.set(up).write_back(); + // des.run(5000); + // }); +} + +void Clock::toggleClk() noexcept { + clkHigh(); + clkLow(); +} + +void Clock::clockHigh() noexcept { + clkHigh(); +} + +void Clock::clockLow() noexcept { + clkLow(); +} diff --git a/finn_xsi/finn_xsi/src/Design.cpp b/finn_xsi/finn_xsi/src/Design.cpp new file mode 100644 index 0000000000..fcd85738eb --- /dev/null +++ b/finn_xsi/finn_xsi/src/Design.cpp @@ -0,0 +1,51 @@ +#include + +using namespace xsi; + +// Constructors +Design::Design(xsi::Kernel& kernel, const std::string& design_lib, const s_xsi_setup_info& setup_info) : _kernel(std::move(kernel)) { _kernel.open(design_lib, setup_info); } + +Design::Design(xsi::Kernel& kernel, const std::string& design_lib, const char* const log_file, const char* const wdb_file) + : Design(kernel, design_lib, s_xsi_setup_info{.logFileName = const_cast(log_file), .wdbFileName = const_cast(wdb_file)}) {} + +// Destructor +Design::~Design() { _kernel.close(); } + +// Move constructor +Design::Design(Design&& other) noexcept : _kernel(std::move(other._kernel)) { + // The kernel now manages the moved design + // No additional work needed as the kernel handles the XSI state +} + +// Move assignment operator +Design& Design::operator=(Design&& other) noexcept { + if (this != &other) { + _kernel.close(); // Close current design + // Note: _kernel is a reference and cannot be reassigned + // The move semantics here are limited since we hold a reference + _kernel = std::move(other._kernel); + } + return *this; +} + +// Simulation Control & Status +void Design::trace_all() { _kernel.xsi(); } + +void Design::run(const XSI_INT64 step) { _kernel.xsi(step); } + +void Design::restart() { _kernel.xsi(); } + +int Design::get_status() const noexcept { return _kernel.xsi(); } + +const char* Design::get_error_info() const noexcept { return _kernel.xsi(); } + +// Port Access +int Design::num_ports() const noexcept { return static_cast(_kernel.port_count()); } + +xsi::Port& Design::getPort(const std::string& name) { return _kernel.getPort(name.c_str()); } + +const xsi::Port& Design::getPort(const std::string& name) const { return _kernel.getPort(name.c_str()); } + +std::span Design::ports() noexcept { return _kernel.ports(); } + +std::span Design::ports() const noexcept { return _kernel.ports(); } diff --git a/finn_xsi/finn_xsi/src/FIFO.cpp b/finn_xsi/finn_xsi/src/FIFO.cpp new file mode 100644 index 0000000000..9cfa53aacc --- /dev/null +++ b/finn_xsi/finn_xsi/src/FIFO.cpp @@ -0,0 +1,104 @@ +#include + +#include +#include +#include + +FIFO::FIFO(uint64_t size) : maxSize(size) {} +FIFO::~FIFO() {} + +/// Prepare update for the next clock cycle. +/// This models Q_srl behavior where: +/// - When empty: only accepts input (ignores output ready), transitions to size 1 +/// - When non-empty: can consume, produce, or both +/// With bounded maxSize, this models a real FIFO with backpressure. +void FIFO::update(bool incomingValid, bool incomingReady) { + // When empty: only push if valid (ignoring ready) + // When non-empty: push if valid AND space available + uint64_t canPush = incomingValid & (currentUtil < maxSize); + + // Q_srl behavior: when empty, only check input valid (ignore output ready) + // Only pop if was non-empty at start AND output ready + uint64_t canPop = incomingReady & (currentUtil != 0); + + nextUtil = nextUtil + canPush - canPop; +} + +/// Toggle the clock cycle, and update the previously set values. +/// nextUtil is guaranteed to be in [0, maxSize] by all operations. +/// Returns false if a first valid signal was expected, but has not been observed. +bool FIFO::toggleClock() { + currentUtil = nextUtil; + maxUtil = std::max(maxUtil, currentUtil); + nextUtil = currentUtil; + cyclesUntilExpectedFirstValid -= static_cast(static_cast(cyclesUntilExpectedFirstValid) & !static_cast(maxUtil)); // Underflow-safe decrement + return (cyclesUntilExpectedFirstValid == 0) & (maxUtil == 0); +} + +/// Return whether the FIFO can accept inputs (for the current utilization) +/// Uses nextUtil (post-push state) so that ready correctly reflects capacity +/// after any push already committed this cycle, preventing AXI-S violations. +bool FIFO::getInputReady([[maybe_unused]] std::stop_token stoken) noexcept { return nextUtil < maxSize; } + +/// Return whether the FIFO can output values (for the current utilization) +bool FIFO::getOutputValid([[maybe_unused]] std::stop_token stoken) noexcept { return currentUtil > 0; } + +/// Return whether the FIFO is empty (for the current utilization) +bool FIFO::isEmpty() const { return currentUtil == 0; } + +/// Reset the FIFOs internal state. If size is given, also set maxSize, +/// otherwise keep it. +void FIFO::reset(uint64_t size) { + currentUtil = 0; + maxUtil = 0; + maxSize = size; + nextUtil = 0; + cyclesUntilExpectedFirstValid = std::numeric_limits::max(); +} + +void FIFO::setCyclesUntilExpectedFirstValid(uint64_t cycles) { + cyclesUntilExpectedFirstValid = cycles; + initialCyclesUntilExpectedFirstValid = cycles; + std::cout << "FIFO set to expect first valid after " << cycles << " cycles" << std::endl; +} + +uint64_t FIFO::getCyclesUntilFirstValid() const { return initialCyclesUntilExpectedFirstValid - cyclesUntilExpectedFirstValid; } + +/// Set the FIFOs max size +void FIFO::setMaxSize(const uint64_t size) { maxSize = size; } + +uint64_t FIFO::getMaxSize() const { return maxSize; } + +uint64_t FIFO::getSpaceLeft() const { return maxSize - currentUtil; } + +uint64_t FIFO::getMaxUtil() const { return maxUtil; } + +void FIFO::increaseCounter(const uint64_t count) { + // Branchless: compute new value and saturate at maxSize + uint64_t newUtil = nextUtil + count; + uint64_t overflow = newUtil > maxSize; + nextUtil = overflow ? maxSize : newUtil; +} + +/// If incomingValid is true and FIFO has space, increment nextUtil +/// Matches Q_srl: when empty, always accepts input +/// When using tryPush/tryPop separately, ALWAYS call tryPush BEFORE tryPop! +void FIFO::setInputValid(bool incomingValid, [[maybe_unused]] std::stop_token stoken) { + // When empty: accept input unconditionally (like Q_srl state_empty) + // When non-empty: accept if space available + nextUtil += incomingValid & (nextUtil < maxSize); +} + +/// If incomingReady is true and FIFO has data, decrement nextUtil +/// Matches Q_srl: only pops if data available +/// When using tryPush/tryPop separately, ALWAYS call tryPush BEFORE tryPop! +/// Note: If FIFO was empty and tryPush just added data, tryPop will NOT pop it +/// (matching Q_srl where state_empty ignores output ready) +void FIFO::setOutputReady(bool incomingReady, [[maybe_unused]] std::stop_token stoken) { + // Check currentUtil (state at cycle start) not nextUtil (after tryPush) + // This ensures empty->tryPush->tryPop results in size=1, matching Q_srl + nextUtil -= incomingReady & (currentUtil > 0); +} + +/// Return the current number of elements in the FIFO +uint64_t FIFO::size() const { return currentUtil; } diff --git a/finn_xsi/finn_xsi/src/Kernel.cpp b/finn_xsi/finn_xsi/src/Kernel.cpp new file mode 100644 index 0000000000..8b5b16657a --- /dev/null +++ b/finn_xsi/finn_xsi/src/Kernel.cpp @@ -0,0 +1,168 @@ +#include +#include +#include + +#include +#include +#include + +using namespace xsi; + +void* resolve_or_throw(xsi::SharedLibrary& lib, char const* const sym) { + auto const res = lib.getsymbol(sym); + if (!res) { + throw std::runtime_error(std::string("Failed to resolve ").append(sym).append(" in ").append(lib.path())); + } + return *res; +} + +char const* const Kernel::Xsi::FUNC_NAMES[EXTENT] = {"xsi_get_value", "xsi_put_value", + "xsi_get_int_port", "xsi_get_str_port", + + "xsi_get_int", "xsi_get_port_number", + + "xsi_trace_all", "xsi_run", + "xsi_restart", "xsi_get_status", + "xsi_get_error_info", + + "xsi_close"}; + + +Kernel::Xsi::Xsi(xsi::SharedLibrary& lib) : _hdl(nullptr) { + // Resolve XSI Functions + for (unsigned i = 0; i < EXTENT; i++) { + _func[i] = resolve_or_throw(lib, FUNC_NAMES[i]); + } +} + +// Xsi Move constructor +Kernel::Xsi::Xsi(Xsi&& other) noexcept : _hdl(other._hdl) { + std::copy(std::begin(other._func), std::end(other._func), std::begin(_func)); + other._hdl = nullptr; + std::fill(std::begin(other._func), std::end(other._func), nullptr); +} + +// Xsi Move assignment operator +Kernel::Xsi& Kernel::Xsi::operator=(Xsi&& other) noexcept { + if (this != &other) { + _hdl = other._hdl; + std::copy(std::begin(other._func), std::end(other._func), std::begin(_func)); + other._hdl = nullptr; + std::fill(std::begin(other._func), std::end(other._func), nullptr); + } + return *this; +} + +// Xsi Handle management +void Kernel::Xsi::setHandle(xsiHandle hdl) noexcept { _hdl = hdl; } + +bool Kernel::Xsi::hasValidHandle() const noexcept { return _hdl != nullptr; } +//--------------------------------------------------------------------------- +// Life Cycle + +// Move constructor +Kernel::Kernel(Kernel&& other) noexcept : _kernel_lib(std::move(other._kernel_lib)), _xsi(std::move(other._xsi)), _design_lib(std::move(other._design_lib)), _ports() { + // Reset source + other._ports.clear(); + + // Recreate ports if design is open + if (_design_lib && _xsi.hasValidHandle()) { + // Enumerate Ports + unsigned const port_count = static_cast(xsi(xsiNumTopPorts)); + _ports.reserve(port_count); + for (unsigned i = 0; i < port_count; ++i) { + _ports.emplace_back(Port(*this, i)); + } + } +} + +// Move assignment operator +Kernel& Kernel::operator=(Kernel&& other) noexcept { + if (this != &other) { + // Clean up current state + close(); + + // Move from other + _kernel_lib = std::move(other._kernel_lib); + _xsi = std::move(other._xsi); + _design_lib = std::move(other._design_lib); + + // Reset ports in source + other._ports.clear(); + + // Recreate ports if design is open + if (_design_lib && _xsi.hasValidHandle()) { + // Enumerate Ports + unsigned const port_count = static_cast(xsi(xsiNumTopPorts)); + _ports.reserve(port_count); + for (unsigned i = 0; i < port_count; i++) { + _ports.emplace_back(Port(*this, i)); + } + } + } + return *this; +} + +Kernel::Kernel(const std::string& kernel_lib) : _kernel_lib(kernel_lib), _xsi(_kernel_lib) {} + +Kernel::~Kernel() { + if (_design_lib) + std::cerr << "Disposing XSI Kernel with open Design." << std::endl; +} + +void Kernel::open(const std::string& design_lib, const s_xsi_setup_info& setup_info) { + _design_lib.open(design_lib); + try { + auto const f = t_fp_xsi_open(resolve_or_throw(_design_lib, "xsi_open")); + xsiHandle const hdl = f(const_cast(&setup_info)); + if (!hdl) + throw std::runtime_error("Loading of design failed"); + _xsi.setHandle(hdl); + + // Enumerate Ports + unsigned const port_count = static_cast(xsi(xsiNumTopPorts)); + _ports.reserve(port_count); + for (unsigned i = 0; i < port_count; i++) { + _ports.emplace_back(Port(*this, i)); + } + } catch (...) { + std::cerr << "Exception during design open, closing design library." << std::endl; + _design_lib.close(); + throw; + } +} +void Kernel::close() noexcept { + xsi(); + _xsi.setHandle(nullptr); + _design_lib.close(); + + // Clear ports - unique_ptr will handle destruction automatically + _ports.clear(); + + // Clean up Library State + std::optional vptr = _kernel_lib.getsymbol("svTypeInfo"); + if (vptr) + *vptr = nullptr; +} + +Port& Kernel::getPort(const char* const name) { + int const id = xsi(name); + + if (id == -1 || id >= static_cast(_ports.size())) { + throw std::runtime_error(std::string("Port not found: ").append(name)); + } + return _ports[static_cast(id)]; +} +const Port& Kernel::getPort(const char* const name) const { + int const id = xsi(name); + + if (id == -1 || id >= static_cast(_ports.size())) { + throw std::runtime_error(std::string("Port not found: ").append(name)); + } + return _ports[static_cast(id)]; +} +std::span Kernel::ports() noexcept { return std::span(_ports.data(), _ports.data() + _ports.size()); } +std::span Kernel::ports() const noexcept { return std::span(_ports.data(), _ports.data() + _ports.size()); } + +// Port count accessor for Design class +size_t Kernel::port_count() const noexcept { return _ports.size(); } diff --git a/finn_xsi/finn_xsi/src/Port.cpp b/finn_xsi/finn_xsi/src/Port.cpp new file mode 100644 index 0000000000..436c2f4778 --- /dev/null +++ b/finn_xsi/finn_xsi/src/Port.cpp @@ -0,0 +1,208 @@ +#include +#include +#include + +using namespace xsi; + +Port::Port(Kernel& kernel, const unsigned id) : _kernel(kernel), _id(id), buffer((width() + 31) / 32) {} + +Port::Port(Port&& other) noexcept : _kernel(other._kernel), _id(other._id), buffer(std::move(other.buffer)) { + // Note: _kernel and _id are reference and const respectively, so they're initialized from other + // The buffer is moved from the other object +} + +Port::~Port() noexcept {} + +bool Port::hasUnknown() const noexcept { + for (auto&& elem : buffer) { + if (elem.bVal) + return true; + } + return false; +} + +bool Port::isZero() const noexcept { + for (auto&& elem : buffer) { + if (elem.aVal) + return false; + } + return true; +} + +std::string Port::as_binstr() const { + unsigned const w = width(); + std::string res(w, '?'); + + auto buffer_iter = buffer.cbegin(); + auto res_iter = res.rbegin(); // Use reverse iterator to fill from right to left + + uint32_t a = 0; + uint32_t b = 0; + for (unsigned i = 0; i < w; i++) { + if ((i & 31) == 0) { + a = buffer_iter->aVal; + b = buffer_iter->bVal; + ++buffer_iter; + } + *res_iter++ = XZ10[((b & 1) << 1) | (a & 1)]; + a >>= 1; + b >>= 1; + } + + return res; +} + +std::string Port::as_hexstr() const { + unsigned l = (width() + 3) / 4; + std::string res(l, '?'); + auto buffer_iter = buffer.cbegin(); + auto res_iter = res.rbegin(); // Use reverse iterator to fill from right to left + + while (l > 0) { + uint32_t a = buffer_iter->aVal; + uint32_t b = buffer_iter->bVal; + ++buffer_iter; + + unsigned m = std::min(8u, l); + l -= m; + for (unsigned i = 0; i < m; ++i) { + unsigned const bm = b & 0xF; + unsigned const am = a & 0xF; + + *res_iter++ = !bm ? HEX[am] : XZ10[3 - !(am & bm)]; + a >>= 4; + b >>= 4; + } + } + return res; +} + +Port& Port::clear() { + std::fill(buffer.begin(), buffer.end(), s_xsi_vlog_logicval{.aVal = 0u, .bVal = 0u}); + return *this; +} + +const char* Port::name() const noexcept { return _kernel.xsi(static_cast(_id), xsiNameTopPort); } + +int Port::dir() const noexcept { return _kernel.xsi(static_cast(_id), xsiDirectionTopPort); } + +unsigned Port::width() const noexcept { return static_cast(_kernel.xsi(static_cast(_id), xsiHDLValueSize)); } + +bool Port::isInput() const noexcept { return dir() == xsiInputPort; } + +bool Port::isOutput() const noexcept { return dir() == xsiOutputPort; } + +bool Port::isInout() const noexcept { return dir() == xsiInoutPort; } + +Port& Port::read() { + _kernel.xsi(static_cast(_id), buffer.data()); + return *this; +} + +void Port::write_back() { _kernel.xsi(static_cast(_id), buffer.data()); } + +bool Port::operator[](const unsigned idx) const noexcept { return (buffer[idx / 32].aVal >> (idx % 32)) & 1; } + +bool Port::as_bool() const noexcept { return buffer[0].aVal & 1; } + +unsigned Port::as_unsigned() const noexcept { return buffer[0].aVal; } + +Port& Port::set(const unsigned val) { + s_xsi_vlog_logicval* const p = buffer.data(); + p->aVal = val; + p->bVal = 0; + return *this; +} + +Port& Port::set_binstr(const std::string& val) { + auto val_iter = val.crbegin(); // Process from right to left + + size_t chars_processed = 0; + const size_t val_length = val.length(); + + for (auto& elem : buffer) { + uint32_t a = 0; + uint32_t b = 0; + + // Process up to 32 characters for this buffer element + const size_t chars_to_process = std::min(32UL, val_length - chars_processed); + + for (size_t j = 0; j < chars_to_process; ++j) { + a <<= 1; + b <<= 1; + + if (val_iter != val.crend()) { + switch (*val_iter++) { + case '1': + a |= 1; + [[fallthrough]]; + case '0': + break; + default: + a |= 1; + [[fallthrough]]; + case 'Z': + case 'z': + b |= 1; + break; + } + } + } + + elem.aVal = a; + elem.bVal = b; + + chars_processed += chars_to_process; + if (chars_processed >= val_length) + break; + } + + return *this; +} + +Port& Port::set_hexstr(const std::string& val) { + auto val_iter = val.crbegin(); // Process from right to left + + size_t chars_processed = 0; + const size_t val_length = val.length(); + + for (auto& elem : buffer) { + uint32_t a = 0; + uint32_t b = 0; + + // Process up to 8 hex characters (32 bits) for this buffer element + const size_t chars_to_process = std::min(8UL, val_length - chars_processed); + + for (size_t j = 0; j < chars_to_process; ++j) { + a <<= 4; + b <<= 4; + + if (val_iter != val.crend()) { + char c = *val_iter++; + + if (('0' <= c) && c <= '9') { + a |= c & 0xF; + } else { + c |= 0x20; // Convert to lowercase + if (('a' <= c) && (c <= 'f')) { + a |= static_cast(c - ('a' - 10)); + } else { + b |= 0xF; + if (c != 'z') { + a |= 0xF; + } + } + } + } + } + + elem.aVal = a; + elem.bVal = b; + + chars_processed += chars_to_process; + if (chars_processed >= val_length) + break; + } + + return *this; +} diff --git a/finn_xsi/finn_xsi/src/SharedLibrary.cpp b/finn_xsi/finn_xsi/src/SharedLibrary.cpp new file mode 100644 index 0000000000..81ce1ff33e --- /dev/null +++ b/finn_xsi/finn_xsi/src/SharedLibrary.cpp @@ -0,0 +1,120 @@ +#include + +#include + +using namespace xsi; + +char const SharedLibrary::library_suffix[] = +#if defined(_WIN32) + ".lib"; +#else + ".so"; +#endif + +#if defined(_WIN32) +namespace { + std::string translate_error_message(DWORD errid) { + std::string msg; + LPTSTR bufptr; + FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, errid, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), &bufptr, 0, nullptr); + if (bufptr) + msg = reinterpret_cast(bufptr); + LocalFree(bufptr); + return msg; + } +} // namespace +#endif + +SharedLibrary& SharedLibrary::open(const std::string& path) { + if (_lib) + throw std::runtime_error("SharedLibrary still open for " + _path); + _lib = load(path); + _path = path; + return *this; +} + +SharedLibrary::handle_type SharedLibrary::load(const std::string& path) { + if (path.empty()) + throw std::domain_error("Empty library path."); + +#if defined(_WIN32) + SetLastError(0); + #ifdef UNICODE + // Use LoadLibraryA explicitly on windows if UNICODE is defined + handle_type const lib = LoadLibraryA(path.c_str()); + #else + handle_type const lib = LoadLibrary(path.c_str()); + #endif + if (!lib) + throw std::runtime_error(translate_error_message(GetLastError())); +#else + handle_type const lib = dlopen(path.c_str(), RTLD_LAZY | RTLD_GLOBAL); + if (!lib) + throw std::runtime_error(dlerror()); +#endif + return lib; +} + +void SharedLibrary::unload() noexcept { + if (_lib) { +#if defined(_WIN32) + FreeLibrary(_lib); +#else + dlclose(_lib); +#endif + } +} + +std::optional SharedLibrary::getsymbol(const char* const name) { + void* sym; +#if defined(_WIN32) + sym = (void*) GetProcAddress(_lib, name); + if (!sym) +#else + dlerror(); // clear error + sym = dlsym(_lib, name); + char const* const err = dlerror(); + if (err) +#endif + return std::nullopt; + return std::make_optional(sym); +} + +// Constructors +SharedLibrary::SharedLibrary() : _lib(nullptr), _path() {} + +SharedLibrary::SharedLibrary(const std::string& path) : _lib(load(path)), _path(path) {} + +// Destructor +SharedLibrary::~SharedLibrary() { unload(); } + +// Move constructor +SharedLibrary::SharedLibrary(SharedLibrary&& other) noexcept : _lib(other._lib), _path(std::move(other._path)) { other._lib = nullptr; } + +// Move assignment operator +SharedLibrary& SharedLibrary::operator=(SharedLibrary&& other) noexcept { + if (this != &other) { + // Clean up current state + unload(); + + // Move from other + _lib = other._lib; + _path = std::move(other._path); + + // Reset other + other._lib = nullptr; + } + return *this; +} + +// Member functions +SharedLibrary::operator bool() const noexcept { return bool(_lib); } + +SharedLibrary& SharedLibrary::close() noexcept { + unload(); + _lib = nullptr; + _path.clear(); + return *this; +} + +const std::string& SharedLibrary::path() const noexcept { return _path; } diff --git a/finn_xsi/finn_xsi/src/SocketServer.cpp b/finn_xsi/finn_xsi/src/SocketServer.cpp new file mode 100644 index 0000000000..e5fafe3997 --- /dev/null +++ b/finn_xsi/finn_xsi/src/SocketServer.cpp @@ -0,0 +1,143 @@ +#include +#include +#include +#include + +#include +#include +#include + +SocketServer::SocketServer(std::string_view path) : socket_path(path) {} + +SocketServer::~SocketServer() { close_connection(); } + +SocketServer::SocketServer(SocketServer&& other) noexcept + : server_fd(std::exchange(other.server_fd, -1)), client_fd(std::exchange(other.client_fd, -1)), socket_path(std::move(other.socket_path)) {} + +SocketServer& SocketServer::operator=(SocketServer&& other) noexcept { + if (this != &other) { + close_connection(); + server_fd = std::exchange(other.server_fd, -1); + client_fd = std::exchange(other.client_fd, -1); + socket_path = std::move(other.socket_path); + } + return *this; +} + +void SocketServer::close_fd(int& fd) noexcept { + if (fd >= 0) { + ::close(fd); + fd = -1; + } +} + +std::optional SocketServer::initialize() { + // Create socket + server_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (server_fd < 0) { + return "Failed to create socket: " + std::string(strerror(errno)); + } + + // Remove existing socket file + unlink(socket_path.c_str()); + + // Bind socket + sockaddr_un addr{}; + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, socket_path.c_str(), sizeof(addr.sun_path) - 1); + + if (bind(server_fd, reinterpret_cast(&addr), sizeof(addr)) < 0) { + std::string error = "Failed to bind socket: " + std::string(strerror(errno)); + close_fd(server_fd); + return error; + } + + // Listen + if (listen(server_fd, 1) < 0) { + std::string error = "Failed to listen on socket: " + std::string(strerror(errno)); + close_fd(server_fd); + return error; + } + + // Accept connection + client_fd = accept(server_fd, nullptr, nullptr); + if (client_fd < 0) { + std::string error = "Failed to accept connection: " + std::string(strerror(errno)); + close_fd(server_fd); + return error; + } + + return std::nullopt; // Success +} + +std::optional SocketServer::receive_message() { + if (client_fd < 0) { + std::cerr << "Socket not connected" << std::endl; + return std::nullopt; + } + + // Read length prefix + uint32_t length{}; + const ssize_t bytes_read = read(client_fd, &length, sizeof(length)); + if (bytes_read != sizeof(length)) { + if (bytes_read == 0) { + std::cerr << "Connection closed by client" << std::endl; + } else { + std::cerr << "Failed to read message length: " << strerror(errno) << std::endl; + } + return std::nullopt; + } + + // Read message + std::string buffer(length, '\0'); + size_t total_read = 0; + while (total_read < length) { + const ssize_t n = read(client_fd, buffer.data() + total_read, length - total_read); + if (n <= 0) { + std::cerr << "Failed to read message data: " << strerror(errno) << std::endl; + return std::nullopt; + } + total_read += static_cast(n); + } + + try { + return json::parse(buffer); + } catch (const json::exception& e) { + std::cerr << "Failed to parse JSON: " << e.what() << std::endl; + return std::nullopt; + } +} + +void SocketServer::send_message(const json& message) { + if (client_fd < 0) { + std::cerr << "Socket not connected" << std::endl; + return; + } + + const std::string msg_str = message.dump(); + const uint32_t length = static_cast(msg_str.size()); + + // Send length prefix + const ssize_t bytes_written = write(client_fd, &length, sizeof(length)); + if (bytes_written != sizeof(length)) { + std::cerr << "Failed to write message length: " << strerror(errno) << std::endl; + return; + } + + // Send message + size_t total_written = 0; + while (total_written < length) { + const ssize_t n = write(client_fd, msg_str.data() + total_written, length - total_written); + if (n <= 0) { + std::cerr << "Failed to write message data: " << strerror(errno) << std::endl; + return; + } + total_written += static_cast(n); + } +} + +void SocketServer::close_connection() noexcept { + close_fd(client_fd); + close_fd(server_fd); + unlink(socket_path.c_str()); +} diff --git a/finn_xsi/finn_xsi/unittests/CMakeLists.txt b/finn_xsi/finn_xsi/unittests/CMakeLists.txt new file mode 100644 index 0000000000..20a699ae33 --- /dev/null +++ b/finn_xsi/finn_xsi/unittests/CMakeLists.txt @@ -0,0 +1,40 @@ +# Enable testing +enable_testing() + +# Fetch Google Test +include(FetchContent) +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG v1.14.0 +) +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Add FIFO unit tests +add_executable(FIFO_test FIFO_test.cpp ${CORE_SRC}) +target_link_libraries(FIFO_test PRIVATE nlohmann_json::nlohmann_json GTest::gtest_main Threads::Threads -ldl -lrt) +target_include_directories(FIFO_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include) +target_include_directories(FIFO_test PUBLIC "$ENV{XILINX_VIVADO}/data/xsim/include") + +# Add InterprocessCommunicationChannel unit tests +add_executable(InterprocessCommunicationChannel_test InterprocessCommunicationChannel_test.cpp) +target_link_libraries(InterprocessCommunicationChannel_test PRIVATE GTest::gtest_main Threads::Threads -ldl -lrt nlohmann_json::nlohmann_json) +target_include_directories(InterprocessCommunicationChannel_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include ${Boost_INCLUDE_DIRS}) + +# Add Integration tests (FIFO + InterSimulationInterface) +add_executable(Integration_test Integration_test.cpp ${CORE_SRC}) +target_link_libraries(Integration_test PRIVATE nlohmann_json::nlohmann_json GTest::gtest_main Threads::Threads -ldl -lrt) +target_include_directories(Integration_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include ${Boost_INCLUDE_DIRS}) +target_include_directories(Integration_test PUBLIC "$ENV{XILINX_VIVADO}/data/xsim/include") + +# Register tests with CTest +include(GoogleTest) +gtest_discover_tests(FIFO_test) +gtest_discover_tests(InterprocessCommunicationChannel_test) +gtest_discover_tests(Integration_test) + +# Create a target to build all unittests at once +add_custom_target(all_unittests) +add_dependencies(all_unittests FIFO_test InterprocessCommunicationChannel_test Integration_test) diff --git a/finn_xsi/finn_xsi/unittests/FIFO_test.cpp b/finn_xsi/finn_xsi/unittests/FIFO_test.cpp new file mode 100644 index 0000000000..835ab0dd63 --- /dev/null +++ b/finn_xsi/finn_xsi/unittests/FIFO_test.cpp @@ -0,0 +1,854 @@ +#include "FIFO.h" + +#include + +// Test fixture for FIFO tests +class FIFOTest : public ::testing::Test { + protected: + void SetUp() override { + // Setup code if needed + } + + void TearDown() override { + // Cleanup code if needed + } +}; + +// ===== Constructor and Initialization Tests ===== + +TEST_F(FIFOTest, ConstructorWithDefaultSize) { + FIFO fifo; + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_TRUE(fifo.getInputReady()); + EXPECT_FALSE(fifo.getOutputValid()); +} + +TEST_F(FIFOTest, ConstructorWithSpecificSize) { + FIFO fifo(10); + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_TRUE(fifo.getInputReady()); + EXPECT_FALSE(fifo.getOutputValid()); + EXPECT_EQ(fifo.getSpaceLeft(), 10); +} + +TEST_F(FIFOTest, ConstructorWithZeroSize) { + FIFO fifo(0); + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_FALSE(fifo.getInputReady()); + EXPECT_FALSE(fifo.getOutputValid()); + EXPECT_EQ(fifo.getSpaceLeft(), 0); +} + +// ===== Reset Tests ===== + +TEST_F(FIFOTest, ResetClearsState) { + FIFO fifo(10); + fifo.update(true, false); // Add one element + fifo.toggleClock(); + EXPECT_FALSE(fifo.isEmpty()); + + fifo.reset(10); + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_EQ(fifo.getSpaceLeft(), 10); +} + +TEST_F(FIFOTest, ResetChangesSize) { + FIFO fifo(10); + fifo.reset(20); + EXPECT_EQ(fifo.getSpaceLeft(), 20); +} + +TEST_F(FIFOTest, SetMaxSize) { + FIFO fifo(10); + fifo.setMaxSize(15); + EXPECT_EQ(fifo.getSpaceLeft(), 15); +} + +// ===== Basic Update and Toggle Tests ===== + +TEST_F(FIFOTest, PushOneElement) { + FIFO fifo(10); + fifo.update(true, false); // Push (valid=true, ready=false) + fifo.toggleClock(); + + EXPECT_FALSE(fifo.isEmpty()); + EXPECT_TRUE(fifo.getOutputValid()); + EXPECT_TRUE(fifo.getInputReady()); + EXPECT_EQ(fifo.getSpaceLeft(), 9); +} + +TEST_F(FIFOTest, PopOneElement) { + FIFO fifo(10); + // First push an element + fifo.update(true, false); + fifo.toggleClock(); + + // Then pop it + fifo.update(false, true); // Pop (valid=false, ready=true) + fifo.toggleClock(); + + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_FALSE(fifo.getOutputValid()); + EXPECT_EQ(fifo.getSpaceLeft(), 10); +} + +TEST_F(FIFOTest, PushAndPopSimultaneously) { + FIFO fifo(10); + // First push an element + fifo.update(true, false); + fifo.toggleClock(); + + // Now push and pop simultaneously (FIFO size should stay the same) + fifo.update(true, true); + fifo.toggleClock(); + + EXPECT_FALSE(fifo.isEmpty()); + EXPECT_TRUE(fifo.getOutputValid()); + EXPECT_EQ(fifo.getSpaceLeft(), 9); +} + +// ===== Boundary Condition Tests ===== + +TEST_F(FIFOTest, FillToCapacity) { + FIFO fifo(3); + + for (int i = 0; i < 3; ++i) { + fifo.update(true, false); + fifo.toggleClock(); + } + + EXPECT_FALSE(fifo.isEmpty()); + EXPECT_TRUE(fifo.getOutputValid()); + EXPECT_FALSE(fifo.getInputReady()); + EXPECT_EQ(fifo.getSpaceLeft(), 0); +} + +TEST_F(FIFOTest, CannotPushWhenFull) { + FIFO fifo(2); + + // Fill the FIFO + fifo.update(true, false); + fifo.toggleClock(); + fifo.update(true, false); + fifo.toggleClock(); + + EXPECT_FALSE(fifo.getInputReady()); + + // Try to push when full (should have no effect) + fifo.update(true, false); + fifo.toggleClock(); + + EXPECT_EQ(fifo.getSpaceLeft(), 0); +} + +TEST_F(FIFOTest, CanPushAndPullWhenFull) { + FIFO fifo(2); + + // Fill the FIFO + fifo.update(true, false); + fifo.toggleClock(); + fifo.update(true, false); + fifo.toggleClock(); + + EXPECT_FALSE(fifo.getInputReady()); + + // Try to push and pull when full (should have no effect) + fifo.update(true, true); + fifo.toggleClock(); + + EXPECT_EQ(fifo.getSpaceLeft(), 1); + + fifo.reset(2); + + // Fill the FIFO + fifo.update(true, false); + fifo.toggleClock(); + fifo.update(true, false); + fifo.toggleClock(); + + EXPECT_FALSE(fifo.getInputReady()); + + // Try to push and pull when full (should have no effect) + fifo.update(false, true); + fifo.toggleClock(); + + EXPECT_EQ(fifo.getSpaceLeft(), 1); +} + +TEST_F(FIFOTest, CannotPopWhenEmpty) { + FIFO fifo(10); + + EXPECT_TRUE(fifo.isEmpty()); + + // Try to pop when empty (should have no effect) + fifo.update(false, true); + fifo.toggleClock(); + + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_EQ(fifo.getSpaceLeft(), 10); +} + +TEST_F(FIFOTest, CanPushAndPopWhenEmpty) { + FIFO fifo(10); + + EXPECT_TRUE(fifo.isEmpty()); + + // Try to pop when empty (should have no effect) + fifo.update(true, true); + fifo.toggleClock(); + + EXPECT_FALSE(fifo.isEmpty()); + EXPECT_TRUE(fifo.getOutputValid()); + EXPECT_EQ(fifo.getSpaceLeft(), 9); +} + +TEST_F(FIFOTest, PopWhenFullMakesSpaceAvailable) { + FIFO fifo(2); + + // Fill the FIFO + fifo.update(true, false); + fifo.toggleClock(); + fifo.update(true, false); + fifo.toggleClock(); + + EXPECT_FALSE(fifo.getInputReady()); + + // Pop one element + fifo.update(false, true); + fifo.toggleClock(); + + EXPECT_TRUE(fifo.getInputReady()); + EXPECT_EQ(fifo.getSpaceLeft(), 1); +} + +// ===== Sequential Operation Tests ===== + +TEST_F(FIFOTest, SequentialPushAndPop) { + FIFO fifo(5); + + // Push 3 elements + for (int i = 0; i < 3; ++i) { + fifo.update(true, false); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.getSpaceLeft(), 2); + + // Pop 2 elements + for (int i = 0; i < 2; ++i) { + fifo.update(false, true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.getSpaceLeft(), 4); + + // Pop 1 more + fifo.update(false, true); + fifo.toggleClock(); + EXPECT_TRUE(fifo.isEmpty()); +} + +TEST_F(FIFOTest, AlternatingPushPop) { + FIFO fifo(10); + + for (int i = 0; i < 5; ++i) { + // Push + fifo.update(true, false); + fifo.toggleClock(); + EXPECT_FALSE(fifo.isEmpty()); + + // Pop + fifo.update(false, true); + fifo.toggleClock(); + EXPECT_TRUE(fifo.isEmpty()); + } +} + +TEST_F(FIFOTest, StreamingOperation) { + FIFO fifo(10); + + // Push one element first + fifo.update(true, false); + fifo.toggleClock(); + + // Now stream: push and pop simultaneously for multiple cycles + for (int i = 0; i < 100; ++i) { + fifo.update(true, true); + fifo.toggleClock(); + EXPECT_EQ(fifo.getSpaceLeft(), 9); // Size should remain constant + } +} + +// ===== State Query Tests ===== + +TEST_F(FIFOTest, IsEmptyCorrectly) { + FIFO fifo(5); + EXPECT_TRUE(fifo.isEmpty()); + + fifo.update(true, false); + fifo.toggleClock(); + EXPECT_FALSE(fifo.isEmpty()); + + fifo.update(false, true); + fifo.toggleClock(); + EXPECT_TRUE(fifo.isEmpty()); +} + +TEST_F(FIFOTest, IsInputReadyCorrectly) { + FIFO fifo(2); + EXPECT_TRUE(fifo.getInputReady()); + + fifo.update(true, false); + fifo.toggleClock(); + EXPECT_TRUE(fifo.getInputReady()); + + fifo.update(true, false); + fifo.toggleClock(); + EXPECT_FALSE(fifo.getInputReady()); +} + +TEST_F(FIFOTest, IsOutputValidCorrectly) { + FIFO fifo(5); + EXPECT_FALSE(fifo.getOutputValid()); + + fifo.update(true, false); + fifo.toggleClock(); + EXPECT_TRUE(fifo.getOutputValid()); + + fifo.update(false, true); + fifo.toggleClock(); + EXPECT_FALSE(fifo.getOutputValid()); +} + +TEST_F(FIFOTest, GetSpaceLeftCorrectly) { + FIFO fifo(10); + EXPECT_EQ(fifo.getSpaceLeft(), 10); + + for (int i = 0; i < 3; ++i) { + fifo.update(true, false); + fifo.toggleClock(); + EXPECT_EQ(fifo.getSpaceLeft(), 10 - i - 1); + } + + fifo.update(false, true); + fifo.toggleClock(); + EXPECT_EQ(fifo.getSpaceLeft(), 8); +} + +// ===== Edge Case Tests ===== + +TEST_F(FIFOTest, NoUpdateBeforeToggle) { + FIFO fifo(10); + fifo.toggleClock(); // Toggle without update + + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_EQ(fifo.getSpaceLeft(), 10); +} + +TEST_F(FIFOTest, LargeCapacity) { + FIFO fifo(1000000); + EXPECT_EQ(fifo.getSpaceLeft(), 1000000); + + for (int i = 0; i < 100; ++i) { + fifo.update(true, false); + fifo.toggleClock(); + } + + EXPECT_EQ(fifo.getSpaceLeft(), 999900); +} + +// ===== IncreaseCounter Tests ===== + +TEST_F(FIFOTest, IncreaseCounterBasic) { + FIFO fifo(100); + + fifo.update(true, false); + fifo.toggleClock(); + EXPECT_EQ(fifo.getSpaceLeft(), 99); + + fifo.increaseCounter(5); + fifo.toggleClock(); + EXPECT_EQ(fifo.getSpaceLeft(), 94); +} + +TEST_F(FIFOTest, IncreaseCounterOnEmptyFIFO) { + FIFO fifo(100); + + fifo.increaseCounter(10); + fifo.toggleClock(); + EXPECT_EQ(fifo.getSpaceLeft(), 90); +} + +TEST_F(FIFOTest, IncreaseCounterZero) { + FIFO fifo(100); + + fifo.update(true, false); + fifo.toggleClock(); + + fifo.increaseCounter(0); + fifo.toggleClock(); + EXPECT_EQ(fifo.getSpaceLeft(), 99); +} + +// ===== Complex Scenarios ===== + +TEST_F(FIFOTest, BurstTrafficPattern) { + FIFO fifo(20); + + // Burst of 10 pushes + for (int i = 0; i < 10; ++i) { + fifo.update(true, false); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.getSpaceLeft(), 10); + + // Burst of 10 pops + for (int i = 0; i < 10; ++i) { + fifo.update(false, true); + fifo.toggleClock(); + } + EXPECT_TRUE(fifo.isEmpty()); +} + +TEST_F(FIFOTest, StressTestManyOperations) { + FIFO fifo(100); + + // Perform 1000 operations + for (int i = 0; i < 500; ++i) { + fifo.update(true, false); + fifo.toggleClock(); + } + + for (int i = 0; i < 500; ++i) { + fifo.update(false, true); + fifo.toggleClock(); + } + + EXPECT_TRUE(fifo.isEmpty()); +} + +// ===== Multiple FIFO Instances ===== + +TEST_F(FIFOTest, MultipleFIFOsIndependent) { + FIFO fifo1(10); + FIFO fifo2(20); + + fifo1.update(true, false); + fifo1.toggleClock(); + + EXPECT_EQ(fifo1.getSpaceLeft(), 9); + EXPECT_EQ(fifo2.getSpaceLeft(), 20); + + fifo2.update(true, false); + fifo2.update(true, false); + fifo2.toggleClock(); + fifo2.toggleClock(); + + // fifo2 should have 2 elements (last update takes effect) + EXPECT_EQ(fifo1.getSpaceLeft(), 9); + EXPECT_TRUE(fifo2.getSpaceLeft() < 20); +} + +// ===== Individual Method Tests ===== + +TEST_F(FIFOTest, TryPushBasic) { + FIFO fifo(10); + EXPECT_EQ(fifo.size(), 0); + + fifo.setInputValid(true); + fifo.toggleClock(); + + EXPECT_EQ(fifo.size(), 1); + EXPECT_FALSE(fifo.isEmpty()); + EXPECT_TRUE(fifo.getOutputValid()); +} + +TEST_F(FIFOTest, TryPushFalseDoesNothing) { + FIFO fifo(10); + + fifo.setInputValid(false); + fifo.toggleClock(); + + EXPECT_EQ(fifo.size(), 0); + EXPECT_TRUE(fifo.isEmpty()); +} + +TEST_F(FIFOTest, TryPushMultiple) { + FIFO fifo(10); + + for (int i = 0; i < 5; ++i) { + fifo.setInputValid(true); + fifo.toggleClock(); + } + + EXPECT_EQ(fifo.size(), 5); + EXPECT_EQ(fifo.getSpaceLeft(), 5); +} + +TEST_F(FIFOTest, TryPushWhenFull) { + FIFO fifo(3); + + // Fill the FIFO + for (int i = 0; i < 3; ++i) { + fifo.setInputValid(true); + fifo.toggleClock(); + } + + EXPECT_EQ(fifo.size(), 3); + EXPECT_FALSE(fifo.getInputReady()); + + // Try to push when full (should have no effect) + fifo.setInputValid(true); + fifo.toggleClock(); + + EXPECT_EQ(fifo.size(), 3); +} + +TEST_F(FIFOTest, TryPopBasic) { + FIFO fifo(10); + + // First push an element + fifo.setInputValid(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), 1); + + // Then pop it + fifo.setOutputReady(true); + fifo.toggleClock(); + + EXPECT_EQ(fifo.size(), 0); + EXPECT_TRUE(fifo.isEmpty()); +} + +TEST_F(FIFOTest, TryPopFalseDoesNothing) { + FIFO fifo(10); + + fifo.setInputValid(true); + fifo.toggleClock(); + + fifo.setOutputReady(false); + fifo.toggleClock(); + + EXPECT_EQ(fifo.size(), 1); +} + +TEST_F(FIFOTest, TryPopWhenEmpty) { + FIFO fifo(10); + + EXPECT_TRUE(fifo.isEmpty()); + + // Try to pop when empty (should have no effect) + fifo.setOutputReady(true); + fifo.toggleClock(); + + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_EQ(fifo.size(), 0); +} + +TEST_F(FIFOTest, TryPushAndTryPopSameCycle) { + FIFO fifo(10); + + // Push first element + fifo.setInputValid(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), 1); + + // Push and pop in same cycle (order: push then pop) + fifo.setInputValid(true); + fifo.setOutputReady(true); + fifo.toggleClock(); + + // Should still have 1 element (pushed 1, popped 1) + EXPECT_EQ(fifo.size(), 1); + + // Push and pop in same cycle (order: push then pop) + fifo.setOutputReady(true); + fifo.setInputValid(true); + fifo.toggleClock(); + + // Should still have 1 element (pushed 1, popped 1) + EXPECT_EQ(fifo.size(), 1); +} + +TEST_F(FIFOTest, TryPushAndTryPopSameCycleEmptyFIFO) { + FIFO fifo(10); + + // Push and pop in same cycle (order: push then pop) + fifo.setInputValid(true); + fifo.setOutputReady(true); + fifo.toggleClock(); + + // Should still have 1 element (pushed 1, popped 0, because was empty) + EXPECT_EQ(fifo.size(), 1); + + fifo.reset(10); + // Push and pop in same cycle (order: push then pop) + fifo.setInputValid(true); + fifo.setOutputReady(false); + fifo.toggleClock(); + + // Should still have 0 element (pushed 1, popped 0) + EXPECT_EQ(fifo.size(), 1); +} + +TEST_F(FIFOTest, TryPushAndTryPopSameCycleFullFIFO) { + FIFO fifo(1); + + // Push first element + fifo.setInputValid(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), 1); + EXPECT_FALSE(fifo.getInputReady()); + + // Push and pop in same cycle (order: push then pop) + fifo.setInputValid(true); + fifo.setOutputReady(true); + fifo.toggleClock(); + + // Should still have 1 element (pushed 1, popped 1) + EXPECT_EQ(fifo.size(), 0); + + fifo.reset(1); + + // Push first element + fifo.setInputValid(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), 1); + EXPECT_FALSE(fifo.getInputReady()); + + // Push and pop in same cycle (order: push then pop) + fifo.setInputValid(false); + fifo.setOutputReady(true); + fifo.toggleClock(); + + // Should still have 1 element (pushed 1, popped 1) + EXPECT_EQ(fifo.size(), 0); +} + +TEST_F(FIFOTest, TryPushAndTryPopSequence) { + FIFO fifo(10); + + // Push 3 + for (int i = 0; i < 3; ++i) { + fifo.setInputValid(true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.size(), 3); + + // Pop 2 + for (int i = 0; i < 2; ++i) { + fifo.setOutputReady(true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.size(), 1); + + // Push 1 more + fifo.setInputValid(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), 2); +} + +TEST_F(FIFOTest, TryPushAndTryPopStreaming) { + FIFO fifo(10); + + // Initialize with one element + fifo.setInputValid(true); + fifo.toggleClock(); + + // Stream: push and pop simultaneously for many cycles + for (int i = 0; i < 100; ++i) { + fifo.setInputValid(true); + fifo.setOutputReady(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), 1); // Size should remain constant + } +} + +TEST_F(FIFOTest, TryPushAlternatingValid) { + FIFO fifo(10); + + for (int i = 0; i < 10; ++i) { + fifo.setInputValid(i % 2 == 0); // Push only on even iterations + fifo.toggleClock(); + } + + EXPECT_EQ(fifo.size(), 5); // Should have 5 elements +} + +TEST_F(FIFOTest, TryPopAlternatingReady) { + FIFO fifo(10); + + // Fill with 6 elements + for (int i = 0; i < 6; ++i) { + fifo.setInputValid(true); + fifo.toggleClock(); + } + + // Pop alternating + for (int i = 0; i < 10; ++i) { + fifo.setOutputReady(i % 2 == 0); // Pop only on even iterations + fifo.toggleClock(); + } + + EXPECT_EQ(fifo.size(), 1); // 6 - 5 pops = 1 +} + +TEST_F(FIFOTest, SizeMethodCorrectness) { + FIFO fifo(20); + + EXPECT_EQ(fifo.size(), 0); + + for (int i = 1; i <= 10; ++i) { + fifo.setInputValid(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), i); + } + + for (int i = 9; i >= 0; --i) { + fifo.setOutputReady(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), i); + } +} + +TEST_F(FIFOTest, TryMethodsVsUpdateEquivalence) { + FIFO fifo1(10); + FIFO fifo2(10); + + // Use update() on fifo1 + fifo1.update(true, false); // Push + fifo1.toggleClock(); + fifo1.update(true, false); // Push + fifo1.toggleClock(); + fifo1.update(false, true); // Pop + fifo1.toggleClock(); + + // Use tryPush/tryPop on fifo2 + fifo2.setInputValid(true); + fifo2.toggleClock(); + fifo2.setInputValid(true); + fifo2.toggleClock(); + fifo2.setOutputReady(true); + fifo2.toggleClock(); + + // Should have same result + EXPECT_EQ(fifo1.size(), fifo2.size()); + EXPECT_EQ(fifo1.isEmpty(), fifo2.isEmpty()); + EXPECT_EQ(fifo1.getOutputValid(), fifo2.getOutputValid()); +} + +TEST_F(FIFOTest, TryMethodsBurstPattern) { + FIFO fifo(50); + + // Burst of pushes + for (int i = 0; i < 30; ++i) { + fifo.setInputValid(true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.size(), 30); + + // Burst of pops + for (int i = 0; i < 20; ++i) { + fifo.setOutputReady(true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.size(), 10); + + // Mixed burst + for (int i = 0; i < 15; ++i) { + fifo.setInputValid(true); + fifo.setOutputReady(true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.size(), 10); // Should remain constant +} + +TEST_F(FIFOTest, TryMethodsStressTest) { + FIFO fifo(1000); + + // Complex pattern + for (int i = 0; i < 500; ++i) { + fifo.setInputValid(i % 3 != 0); // Push 2 out of 3 times + if (i > 100) { + fifo.setOutputReady(i % 2 == 0); // Pop every other time after 100 + } + fifo.toggleClock(); + } + + // Verify FIFO is in valid state + EXPECT_LE(fifo.size(), 1000); + EXPECT_EQ(fifo.size() == 0, fifo.isEmpty()); + EXPECT_EQ(fifo.size() > 0, fifo.getOutputValid()); +} + +TEST_F(FIFOTest, TryMethodsEdgeCaseFullToEmpty) { + FIFO fifo(5); + + // Fill completely + for (int i = 0; i < 5; ++i) { + fifo.setInputValid(true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.size(), 5); + EXPECT_FALSE(fifo.getInputReady()); + + // Empty completely + for (int i = 0; i < 5; ++i) { + fifo.setOutputReady(true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.size(), 0); + EXPECT_TRUE(fifo.isEmpty()); + EXPECT_FALSE(fifo.getOutputValid()); +} + +TEST_F(FIFOTest, TryMethodsWithReset) { + FIFO fifo(10); + + // Add some elements + for (int i = 0; i < 5; ++i) { + fifo.setInputValid(true); + fifo.toggleClock(); + } + EXPECT_EQ(fifo.size(), 5); + + // Reset + fifo.reset(10); + EXPECT_EQ(fifo.size(), 0); + + // Should work normally after reset + fifo.setInputValid(true); + fifo.toggleClock(); + EXPECT_EQ(fifo.size(), 1); +} + +TEST_F(FIFOTest, TestTimeout){ + FIFO fifo(10); + fifo.setCyclesUntilExpectedFirstValid(3); + EXPECT_TRUE(fifo.toggleClock()); // 3 cycles left + EXPECT_TRUE(fifo.toggleClock()); // 2 cycles left + EXPECT_FALSE(fifo.toggleClock()); // 0 cycles left, should return false + + fifo.reset(10); + fifo.setCyclesUntilExpectedFirstValid(2); + EXPECT_TRUE(fifo.toggleClock()); // 2 cycles left + fifo.update(true, false); // Set valid, should disable timeout + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + + fifo.reset(10); + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true + EXPECT_TRUE(fifo.toggleClock()); // Still should return true +} + +// Main function to run all tests +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/finn_xsi/finn_xsi/unittests/Integration_test.cpp b/finn_xsi/finn_xsi/unittests/Integration_test.cpp new file mode 100644 index 0000000000..2edb9d38da --- /dev/null +++ b/finn_xsi/finn_xsi/unittests/Integration_test.cpp @@ -0,0 +1,642 @@ +#include +#include +#include + +#include + +#include "FIFO.h" +#include "InterprocessCommunicationChannel.hpp" + +// Test fixture for integration tests +class IntegrationTest : public ::testing::Test { + protected: + std::string shmName; + + void SetUp() override { + // Generate unique shared memory name for each test + shmName = "test_shm_integration_" + std::to_string(getpid()); + + // Clean up any leftover shared memory from previous runs + boost::interprocess::shared_memory_object::remove(shmName.c_str()); + } + + void TearDown() override { + // Clean up shared memory after test + boost::interprocess::shared_memory_object::remove(shmName.c_str()); + } +}; + +class SimDummy { + bool currentValid = false; + bool currentReady = true; + bool nextValid = false; + bool nextReady = true; + + public: + bool isOutputValid() const { return currentValid; } + void toggleClock() { + currentValid = nextValid; + currentReady = nextReady; + } + bool isInputReady() const { return currentReady; } + void setNextValid(bool v) { nextValid = v; } + void setNextReady(bool r) { nextReady = r; } +}; + +// ===== Basic Integration Tests ===== + +TEST_F(IntegrationTest, OneCycleReadyFalseValidFalse) { + // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair + // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation + + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver with FIFO output to SimDummy + int receivedCount = 0; + { + InterprocessCommunicationChannel receiver(shmName); + FIFO outputFifo(15); + SimDummy simDummy; + + simDummy.setNextReady(false); + bool readySignal = outputFifo.getInputReady(); + bool validSignal = receiver.receive_request(); + if (validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 1 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 15) { + exit(3); + } + if (outputFifo.getInputReady() != true) { + exit(4); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (simDummy.isOutputValid()) { + exit(1); + } + } // Destructor called here + exit(0); + } + + // Parent process: Sender + { + InterprocessCommunicationChannel sender(shmName); + + bool validSignal = false; + bool incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 0; expect ready==false for cycle 1 + + } // Destructor called here + + // Wait for child + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(IntegrationTest, OneCycleReadyTrueValidFalse) { + // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair + // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation + + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver with FIFO output to SimDummy + int receivedCount = 0; + { + InterprocessCommunicationChannel receiver(shmName); + FIFO outputFifo(15); + SimDummy simDummy; + + simDummy.setNextReady(true); + bool readySignal = outputFifo.getInputReady(); + bool validSignal = receiver.receive_request(); + if (validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 1 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 15) { + exit(3); + } + if (outputFifo.getInputReady() != true) { + exit(4); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (simDummy.isOutputValid()) { + exit(1); + } + } // Destructor called here + exit(0); + } + + // Parent process: Sender + { + InterprocessCommunicationChannel sender(shmName); + + bool validSignal = false; + bool incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); + + } // Destructor called here + + // Wait for child + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(IntegrationTest, OneCycleReadyFalseValidTrue) { + // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair + // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation + + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver with FIFO output to SimDummy + int receivedCount = 0; + { + InterprocessCommunicationChannel receiver(shmName); + FIFO outputFifo(15); + SimDummy simDummy; + + simDummy.setNextReady(false); + bool readySignal = outputFifo.getInputReady(); + bool validSignal = receiver.receive_request(); + if (!validSignal) { // It is correct that valid is true here, because we only have a single cycle and the sender input is set to valid in cycle 0. Therefore, we should + // receive a valid in cycle 0. + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 1 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 14) { + exit(3); + } + if (outputFifo.getInputReady() != true) { + exit(4); + } + if (!outputFifo.getOutputValid()) { + exit(5); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (!simDummy.isOutputValid()) { + exit(1); + } + } // Destructor called here + exit(0); + } + + // Parent process: Sender + { + InterprocessCommunicationChannel sender(shmName); + + bool validSignal = true; + bool incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 0; expect ready==true for cycle 1 + + } // Destructor called here + + // Wait for child + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(IntegrationTest, OneCycleReadyTrueValidTrue) { + // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair + // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation + + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver with FIFO output to SimDummy + int receivedCount = 0; + { + InterprocessCommunicationChannel receiver(shmName); + FIFO outputFifo(15); + SimDummy simDummy; + + simDummy.setNextReady(true); + bool readySignal = outputFifo.getInputReady(); + bool validSignal = receiver.receive_request(); + if (!validSignal) { // It is correct that valid is true here, because we only have a single cycle and the sender input is set to valid in cycle 0. Therefore, we should + // receive a valid in cycle 0. + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 1 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 14) { + exit(3); + } + if (outputFifo.getInputReady() != true) { + exit(4); + } + if (!outputFifo.getOutputValid()) { + exit(5); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (!simDummy.isOutputValid()) { + exit(1); + } + } // Destructor called here + exit(0); + } + + // Parent process: Sender + { + InterprocessCommunicationChannel sender(shmName); + + bool validSignal = true; + bool incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); + + } // Destructor called here + + // Wait for child + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + +// ===== Multicycle Integration Tests ===== + +TEST_F(IntegrationTest, TwoCycleReadyFalseValidFalse) { + // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair + // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation + + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver with FIFO output to SimDummy + int receivedCount = 0; + { + InterprocessCommunicationChannel receiver(shmName); + FIFO outputFifo(15); + SimDummy simDummy; + + simDummy.setNextReady(false); + bool readySignal = outputFifo.getInputReady(); + bool validSignal = receiver.receive_request(); + if (validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 1 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 15) { + exit(3); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (simDummy.isOutputValid()) { + exit(1); + } + + simDummy.setNextReady(false); + readySignal = outputFifo.getInputReady(); // Should be true + validSignal = receiver.receive_request(); + if (validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 2 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 15) { + exit(3); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (simDummy.isOutputValid()) { + exit(1); + } + + } // Destructor called here + exit(0); + } + + // Parent process: Sender + { + InterprocessCommunicationChannel sender(shmName); + + bool validSignal = false; + bool incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 0; expect ready==true for cycle 1 + incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 1; expect ready==true for cycle 2 + + } // Destructor called here + + // Wait for child + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(IntegrationTest, TwoCycleReadyTrueValidFalse) { + // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair + // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation + + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver with FIFO output to SimDummy + int receivedCount = 0; + { + InterprocessCommunicationChannel receiver(shmName); + FIFO outputFifo(15); + SimDummy simDummy; + + simDummy.setNextReady(true); + bool readySignal = outputFifo.getInputReady(); + bool validSignal = receiver.receive_request(); + if (validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 1 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 15) { + exit(3); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (simDummy.isOutputValid()) { + exit(1); + } + + simDummy.setNextReady(true); + readySignal = outputFifo.getInputReady(); // Should be true now + validSignal = receiver.receive_request(); + if (validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 2 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 15) { + exit(3); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (simDummy.isOutputValid()) { + exit(1); + } + + } // Destructor called here + exit(0); + } + + // Parent process: Sender + { + InterprocessCommunicationChannel sender(shmName); + + bool validSignal = false; + bool incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 0; expect ready==true for cycle 1 + incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 1; expect ready==true for cycle 2 + + } // Destructor called here + + // Wait for child + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(IntegrationTest, TwoCycleReadyFalseValidTrue) { + // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair + // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation + + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver with FIFO output to SimDummy + int receivedCount = 0; + { + InterprocessCommunicationChannel receiver(shmName); + FIFO outputFifo(15); + SimDummy simDummy; + + simDummy.setNextReady(false); + bool readySignal = outputFifo.getInputReady(); + bool validSignal = receiver.receive_request(); + if (!validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 1 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 14) { + exit(3); + } + if (!outputFifo.getOutputValid()) { + exit(4); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (!simDummy.isOutputValid()) { + exit(1); + } + + simDummy.setNextReady(false); + readySignal = outputFifo.getInputReady(); // Should be true now (FIFO not full) + validSignal = receiver.receive_request(); + if (!validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 2 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 13) { + exit(3); + } + if (!outputFifo.getOutputValid()) { + exit(4); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (!simDummy.isOutputValid()) { + exit(1); + } + + } // Destructor called here + exit(0); + } + + // Parent process: Sender + { + InterprocessCommunicationChannel sender(shmName); + + bool validSignal = true; + bool incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 0; expect ready==true for cycle 1 + incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 1; expect ready==true for cycle 2 + + } // Destructor called here + + // Wait for child + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + + +TEST_F(IntegrationTest, TwoCycleReadyTrueValidTrue) { + // Test: FIFO feeds data to InterprocessCommunicationChannel sender/receiver pair + // Architecture: Sender (process A) -> Receiver -> FIFO (process B) -> SimDummy -> validation + + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver with FIFO output to SimDummy + int receivedCount = 0; + { + InterprocessCommunicationChannel receiver(shmName); + FIFO outputFifo(15); + SimDummy simDummy; + + simDummy.setNextReady(true); + bool readySignal = outputFifo.getInputReady(); + bool validSignal = receiver.receive_request(); + if (!validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 1 STARTS + + // Verify FIFO state and SimDummy + if (outputFifo.getSpaceLeft() != 14) { + exit(3); + } + if (!outputFifo.getOutputValid()) { + exit(4); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (!simDummy.isOutputValid()) { + exit(1); + } + + simDummy.setNextReady(true); + readySignal = outputFifo.getInputReady(); // Should be true now + validSignal = receiver.receive_request(); + if (!validSignal) { + exit(2); + } + receiver.send_response(readySignal); + outputFifo.update(validSignal, simDummy.isInputReady()); + outputFifo.toggleClock(); // BELOW HERE CYCLE 2 STARTS + + // Verify FIFO state and SimDummy - FIFO consumes data because SimDummy is ready + if (outputFifo.getSpaceLeft() != 14) { + exit(3); + } + if (!outputFifo.getOutputValid()) { + exit(4); + } + simDummy.setNextValid(outputFifo.getOutputValid()); + simDummy.toggleClock(); + if (!simDummy.isOutputValid()) { + exit(1); + } + + } // Destructor called here + exit(0); + } + + // Parent process: Sender + { + InterprocessCommunicationChannel sender(shmName); + + bool validSignal = true; + bool incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 0; expect ready==true for cycle 1 + incomingReady = sender.send_request(validSignal); + EXPECT_TRUE(incomingReady); // We are in cycle 1; expect ready==true for cycle 2 + + } // Destructor called here + + // Wait for child + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); +} + + +// ===== Sender Side Integration Tests ===== + +TEST_F(IntegrationTest, SimToFIFO) { + // Architecture: SimDummy -> FIFO + + SimDummy sim; + FIFO fifo(15); + + // Propagate valid through SimDummy + sim.setNextValid(true); + fifo.update(sim.isOutputValid(), false); + EXPECT_TRUE(fifo.getInputReady()); + sim.setNextReady(fifo.getInputReady()); + fifo.toggleClock(); + sim.toggleClock(); + EXPECT_EQ(fifo.size(), 0); + EXPECT_TRUE(sim.isInputReady()); + + // Fill FIFO to capacity + for (std::size_t i = 0; i < 15; ++i) { + sim.setNextValid(true); + + fifo.update(sim.isOutputValid(), false); + EXPECT_TRUE(fifo.getInputReady()); + sim.setNextReady(fifo.getInputReady()); + EXPECT_EQ(fifo.size(), i); + fifo.toggleClock(); + sim.toggleClock(); + EXPECT_EQ(fifo.size(), i + 1); + EXPECT_TRUE(sim.isInputReady()); + } + + EXPECT_FALSE(fifo.getInputReady()); // FIFO changed to not ready on this cycle; Sim is still ready + sim.setNextValid(true); + fifo.update(sim.isOutputValid(), false); + EXPECT_FALSE(fifo.getInputReady()); + sim.setNextReady(fifo.getInputReady()); + fifo.toggleClock(); + sim.toggleClock(); // Propagate ready false through sim + + EXPECT_EQ(fifo.size(), 15); + EXPECT_FALSE(sim.isInputReady()); +} diff --git a/finn_xsi/finn_xsi/unittests/InterprocessCommunicationChannel_test.cpp b/finn_xsi/finn_xsi/unittests/InterprocessCommunicationChannel_test.cpp new file mode 100644 index 0000000000..8952c6e002 --- /dev/null +++ b/finn_xsi/finn_xsi/unittests/InterprocessCommunicationChannel_test.cpp @@ -0,0 +1,1163 @@ +#include "InterprocessCommunicationChannel.hpp" + +#include +#include +#include + +#include +#include + +// Simple request/response types for testing +struct TestRequest { + int value; + bool flag; + + TestRequest() : value(0), flag(false) {} + TestRequest(int v, bool f) : value(v), flag(f) {} + + bool operator==(const TestRequest& other) const { return value == other.value && flag == other.flag; } +}; + +struct TestResponse { + int result; + bool success; + + TestResponse() : result(0), success(false) {} + TestResponse(int r, bool s) : result(r), success(s) {} + + bool operator==(const TestResponse& other) const { return result == other.result && success == other.success; } +}; + +// Test fixture for InterprocessCommunicationChannel tests +class InterprocessCommunicationChannelTest : public ::testing::Test { + protected: + void SetUp() override { + // Generate unique shared memory name for each test + shmName = "test_ipc_" + std::to_string(getpid()) + "_" + std::to_string(std::chrono::steady_clock::now().time_since_epoch().count()); + } + + void TearDown() override { + // Cleanup: ensure shared memory is removed + boost::interprocess::shared_memory_object::remove(shmName.c_str()); + } + + std::string shmName; +}; + +// ===== Constructor and Initialization Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, SenderConstructorCreatesSharedMemory) { + InterprocessCommunicationChannel sender(shmName); + + // Verify that shared memory exists + bool shmExists = false; + try { + boost::interprocess::managed_shared_memory shmem(boost::interprocess::open_only, shmName.c_str()); + shmExists = true; + } catch (...) { shmExists = false; } + + EXPECT_TRUE(shmExists); +} + +TEST_F(InterprocessCommunicationChannelTest, ReceiverWaitsForSenderToCreateSharedMemory) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver (waits for sender) + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + InterprocessCommunicationChannel receiver(shmName); + exit(0); + } else { + // Parent process: Sender (creates shared memory) + InterprocessCommunicationChannel sender(shmName); + + // Wait for child to complete + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, DefaultConstructorCreatesUninitializedObject) { + InterprocessCommunicationChannel channel; + // Should not crash - object is in moved-from state + // Destructor should handle this gracefully +} + +TEST_F(InterprocessCommunicationChannelTest, MoveConstructorTransfersOwnership) { + InterprocessCommunicationChannel sender1(shmName); + InterprocessCommunicationChannel sender2(std::move(sender1)); + + // sender2 should now own the shared memory + // sender1 should be in moved-from state (destructor shouldn't crash) +} + +TEST_F(InterprocessCommunicationChannelTest, MoveAssignmentTransfersOwnership) { + InterprocessCommunicationChannel sender1(shmName); + InterprocessCommunicationChannel sender2; + + sender2 = std::move(sender1); + + // sender2 should now own the shared memory + // sender1 should be in moved-from state +} + +// ===== Single Request-Response Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, SingleRequestResponseExchange) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender sends request, waits for response + InterprocessCommunicationChannel sender(shmName); + + TestRequest req(42, true); + TestResponse resp = sender.send_request(req); + + // Verify response + exit((resp.result == 84 && resp.success) ? 0 : 1); + } else { + // Parent process: Receiver waits for request, sends response + InterprocessCommunicationChannel receiver(shmName); + + // Small delay to ensure both processes are ready + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, 42); + EXPECT_TRUE(req.flag); + + // Send response (double the request value) + TestResponse resp(req.value * 2, true); + receiver.send_response(resp); + + // Wait for child and check result + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, RequestResponseWithDifferentValues) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender + InterprocessCommunicationChannel sender(shmName); + + TestRequest req(100, false); + TestResponse resp = sender.send_request(req); + + exit((resp.result == 200 && !resp.success) ? 0 : 1); + } else { + // Parent process: Receiver + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, 100); + EXPECT_FALSE(req.flag); + + TestResponse resp(req.value * 2, req.flag); + receiver.send_response(resp); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, SingleSplitJoinRequest) { + // Test that a diamond pattern of communication works + pid_t p1 = fork(); + pid_t p2 = fork(); + pid_t p3 = fork(); + std::string leftName = shmName + "_left_in"; + std::string rightName = shmName + "_right_in"; + std::string leftOutName = shmName + "_left_out"; + std::string rightOutName = shmName + "_right_out"; + + if (p1 != 0 && p2 != 0 && p3 != 0) { + // Parent (origin) + InterprocessCommunicationChannel originToLeft(leftName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + InterprocessCommunicationChannel originToRight(rightName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + originToLeft.handshake(); + originToRight.handshake(); + + // Send message to the left + TestRequest reqLeft(100, false); + TestResponse respLeft = originToLeft.send_request(reqLeft); + EXPECT_EQ(respLeft.result, 600); + + // Send message to the right + TestRequest reqRight(130, false); + TestResponse respRight = originToRight.send_request(reqRight); + EXPECT_EQ(respRight.result, 780); + std::cout << "Origin done." << std::endl; + + } else if (p1 == 0 && p2 != 0 && p3 != 0) { + // P1 (Left) + InterprocessCommunicationChannel fromOrigin(leftName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + InterprocessCommunicationChannel toEnd(leftOutName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + fromOrigin.handshake(); + toEnd.handshake(); + + // Receive from origin + TestRequest req = fromOrigin.receive_request(); + EXPECT_EQ(req.value, 100); + EXPECT_FALSE(req.flag); + + // Forward triple + TestRequest reqForward(req.value * 3, req.flag); + TestResponse resp = toEnd.send_request(reqForward); + auto expectedResponseFromEnd = req.value * 2 * 3; + EXPECT_EQ(resp.result, expectedResponseFromEnd); + + // Answer with value from end + TestResponse respOrigin(resp.result, resp.success); + fromOrigin.send_response(respOrigin); + + std::cout << "Left done." << std::endl; + exit((req.value == 100 && resp.result == expectedResponseFromEnd) ? 0 : 1); + + } else if (p1 != 0 && p2 == 0 && p3 != 0) { + // P2 (Right) + InterprocessCommunicationChannel fromOrigin(rightName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + InterprocessCommunicationChannel toEnd(rightOutName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + fromOrigin.handshake(); + toEnd.handshake(); + + // Receive from origin + TestRequest req = fromOrigin.receive_request(); + EXPECT_EQ(req.value, 130); + EXPECT_FALSE(req.flag); + + // Forward triple + TestRequest reqForward(req.value * 3, req.flag); + TestResponse resp = toEnd.send_request(reqForward); + auto expectedResponseFromEnd = req.value * 2 * 3; + EXPECT_EQ(resp.result, expectedResponseFromEnd); + + // Answer with value from end + TestResponse respOrigin(resp.result, resp.success); + fromOrigin.send_response(respOrigin); + + std::cout << "Right done." << std::endl; + exit((req.value == 130 && resp.result == expectedResponseFromEnd) ? 0 : 1); + + } else if (p1 != 0 && p2 != 0 && p3 == 0) { + // End + InterprocessCommunicationChannel endLeft(leftOutName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + InterprocessCommunicationChannel endRight(rightOutName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + endLeft.handshake(); + endRight.handshake(); + + // Receive and return double + TestRequest reqLeft = endLeft.receive_request(); + EXPECT_EQ(reqLeft.value, 300); + TestResponse respLeft(reqLeft.value * 2, true); + endLeft.send_response(respLeft); + + // Receive and return double + TestRequest reqRight = endRight.receive_request(); + EXPECT_EQ(reqRight.value, 390); + TestResponse respRight(reqRight.value * 2, true); + endRight.send_response(respRight); + + std::cout << "End done." << std::endl; + exit((reqLeft.value == 300 && reqRight.value == 390) ? 0 : 1); + } + + // Wait for all forks to shut down + if (p1 != 0 && p2 != 0 && p3 != 0) { + int status; + waitpid(p1, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + waitpid(p2, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + waitpid(p3, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } + +} + +// ===== Multiple Request-Response Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, MultipleRequestResponseSequential) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender sends multiple requests + InterprocessCommunicationChannel sender(shmName); + + for (int i = 0; i < 10; ++i) { + TestRequest req(i, i % 2 == 0); + TestResponse resp = sender.send_request(req); + + // Verify response matches expected calculation + if (resp.result != i * 3 || resp.success != (i % 2 == 0)) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Receiver processes multiple requests + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + for (int i = 0; i < 10; ++i) { + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, i); + EXPECT_EQ(req.flag, i % 2 == 0); + + // Send calculated response + TestResponse resp(req.value * 3, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, ManyRequestResponseExchanges) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender + InterprocessCommunicationChannel sender(shmName); + + for (int i = 0; i < 1000; ++i) { + TestRequest req(i % 100, i % 3 == 0); + TestResponse resp = sender.send_request(req); + + // Verify response + int expected = (i % 100) + 10; + if (resp.result != expected) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Receiver + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + for (int i = 0; i < 1000; ++i) { + TestRequest req = receiver.receive_request(); + + // Just verify exchange completes without deadlock + int expected_val = i % 100; + EXPECT_EQ(req.value, expected_val); + + TestResponse resp(req.value + 10, true); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, AlternatingRequestPattern) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender with alternating pattern + InterprocessCommunicationChannel sender(shmName); + + for (int i = 0; i < 100; ++i) { + bool flag = (i % 2 == 0); + TestRequest req(i, flag); + TestResponse resp = sender.send_request(req); + + // Verify response + if (resp.result != i * 2 || resp.success != flag) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Receiver + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + for (int i = 0; i < 100; ++i) { + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, i); + EXPECT_EQ(req.flag, i % 2 == 0); + + TestResponse resp(req.value * 2, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// ===== Buffer Flipping Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, BufferFlipsCorrectly) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender + InterprocessCommunicationChannel sender(shmName); + + // Perform multiple exchanges to trigger buffer flips + for (int i = 0; i < 20; ++i) { + TestRequest req(i, true); + TestResponse resp = sender.send_request(req); + + if (resp.result != i + 1) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Receiver + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // Perform multiple exchanges - buffer should flip multiple times + for (int i = 0; i < 20; ++i) { + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, i); + + TestResponse resp(req.value + 1, true); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// ===== Stress Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, HighFrequencyExchanges) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender - rapid exchanges + InterprocessCommunicationChannel sender(shmName); + + for (int i = 0; i < 10000; ++i) { + TestRequest req(i & 0xFF, i & 1); + TestResponse resp = sender.send_request(req); + + if (resp.result != (i & 0xFF) * 2) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Receiver - rapid exchanges + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + for (int i = 0; i < 10000; ++i) { + TestRequest req = receiver.receive_request(); + + TestResponse resp(req.value * 2, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, StressTestWithComplexPattern) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender with complex pattern + InterprocessCommunicationChannel sender(shmName); + + for (int i = 0; i < 5000; ++i) { + int val = (i * 7) % 127; + bool flag = ((i * 11) % 13) < 6; + TestRequest req(val, flag); + TestResponse resp = sender.send_request(req); + + if (resp.result != val + 5) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Receiver with response calculation + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + for (int i = 0; i < 5000; ++i) { + TestRequest req = receiver.receive_request(); + + TestResponse resp(req.value + 5, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// ===== Reference Counting Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, ReferenceCountingTwoProcesses) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Create sender and let it go out of scope + { + InterprocessCommunicationChannel sender(shmName); + TestRequest req(1, true); + sender.send_request(req); + } + + // Shared memory should still exist because parent still holds reference + bool shmExists = false; + try { + boost::interprocess::managed_shared_memory shmem(boost::interprocess::open_only, shmName.c_str()); + shmExists = true; + } catch (...) { shmExists = false; } + + exit(shmExists ? 0 : 1); + } else { + // Parent process: Keep receiver alive + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + TestRequest req = receiver.receive_request(); + TestResponse resp(req.value, req.flag); + receiver.send_response(resp); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, SharedMemoryCleanupAfterBothProcessesExit) { + // This test verifies that shared memory is properly cleaned up + // when both processes exit. + + pid_t verifier_pid = fork(); + + if (verifier_pid == 0) { + // Verifier process: spawns two children and then checks cleanup + pid_t sender_pid = fork(); + + if (sender_pid == 0) { + // First child: Sender + // Use block scope so destructor is called before exit + { + InterprocessCommunicationChannel sender(shmName); + TestRequest req(42, true); + sender.send_request(req); + } // Destructor called here + exit(0); + } + + // Small delay to ensure sender creates shared memory + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + pid_t receiver_pid = fork(); + if (receiver_pid == 0) { + // Second child: Receiver + // Use block scope so destructor is called before exit + { + InterprocessCommunicationChannel receiver(shmName); + TestRequest req = receiver.receive_request(); + TestResponse resp(req.value, req.flag); + receiver.send_response(resp); + } // Destructor called here + exit(0); + } + + // Wait for both children to complete + int sender_status, receiver_status; + waitpid(sender_pid, &sender_status, 0); + waitpid(receiver_pid, &receiver_status, 0); + + // Give time for cleanup to complete + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Verify shared memory is cleaned up + bool shmExists = false; + try { + boost::interprocess::managed_shared_memory shmem(boost::interprocess::open_only, shmName.c_str()); + shmExists = true; + } catch (...) { shmExists = false; } + + // Exit with 0 if cleanup succeeded (shmExists == false) + exit(shmExists ? 1 : 0); + } else { + // Parent: Wait for verifier process + int status; + waitpid(verifier_pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// ===== Move Semantics Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, MoveConstructorMaintainsConnection) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender with move + InterprocessCommunicationChannel sender1(shmName); + InterprocessCommunicationChannel sender2(std::move(sender1)); + + TestRequest req(99, false); + TestResponse resp = sender2.send_request(req); + + exit((resp.result == 99 && !resp.success) ? 0 : 1); + } else { + // Parent process: Receiver + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, 99); + EXPECT_FALSE(req.flag); + + TestResponse resp(req.value, req.flag); + receiver.send_response(resp); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, MoveAssignmentMaintainsConnection) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender with move assignment + InterprocessCommunicationChannel sender1(shmName); + InterprocessCommunicationChannel sender2; + sender2 = std::move(sender1); + + TestRequest req(77, true); + TestResponse resp = sender2.send_request(req); + + exit((resp.result == 77 && resp.success) ? 0 : 1); + } else { + // Parent process: Receiver + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, 77); + EXPECT_TRUE(req.flag); + + TestResponse resp(req.value, req.flag); + receiver.send_response(resp); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// ===== Edge Cases ===== + +TEST_F(InterprocessCommunicationChannelTest, FirstCallBehavior) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender - first call should not wait for buffer flip + InterprocessCommunicationChannel sender(shmName); + + auto start = std::chrono::steady_clock::now(); + TestRequest req(1, true); + sender.send_request(req); + auto end = std::chrono::steady_clock::now(); + + // First call should complete quickly (not waiting for previous flip) + auto duration = std::chrono::duration_cast(end - start); + exit(duration.count() < 100 ? 0 : 1); + } else { + // Parent process: Receiver + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + TestRequest req = receiver.receive_request(); + TestResponse resp(req.value, req.flag); + receiver.send_response(resp); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, ConsecutiveRequestsSameValue) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Send same request repeatedly + InterprocessCommunicationChannel sender(shmName); + + for (int i = 0; i < 50; ++i) { + TestRequest req(123, true); + TestResponse resp = sender.send_request(req); + + if (resp.result != 123 || !resp.success) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Verify same request received repeatedly + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + for (int i = 0; i < 50; ++i) { + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, 123); + EXPECT_TRUE(req.flag); + + TestResponse resp(req.value, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// ===== Timing and Synchronization Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, SynchronizationBetweenProcesses) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender - delayed start + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + InterprocessCommunicationChannel sender(shmName); + + for (int i = 0; i < 10; ++i) { + TestRequest req(i, true); + sender.send_request(req); + } + exit(0); + } else { + // Parent process: Receiver - starts immediately + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // Should wait for sender to be ready + for (int i = 0; i < 10; ++i) { + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, i); + + TestResponse resp(req.value, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// ===== Custom Shared Memory Size Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, CustomSharedMemorySize) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender with larger shared memory + InterprocessCommunicationChannel sender(shmName); + + TestRequest req(55, false); + TestResponse resp = sender.send_request(req); + + exit((resp.result == 55) ? 0 : 1); + } else { + // Parent process: Receiver with larger shared memory + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + TestRequest req = receiver.receive_request(); + EXPECT_EQ(req.value, 55); + + TestResponse resp(req.value, req.flag); + receiver.send_response(resp); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// ===== Stop Token Tests ===== + +TEST_F(InterprocessCommunicationChannelTest, SenderCancellationViaStopToken) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender that gets cancelled while waiting for response + InterprocessCommunicationChannel sender(shmName); + + std::stop_source stop_src; + std::jthread canceller([&stop_src]() { + // Cancel after 100ms + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + stop_src.request_stop(); + }); + + TestRequest req(42, true); + auto start = std::chrono::steady_clock::now(); + TestResponse resp = sender.send_request(req, stop_src.get_token()); + auto end = std::chrono::steady_clock::now(); + + auto duration = std::chrono::duration_cast(end - start); + + // Should return default response quickly (within 200ms, accounting for scheduling) + // and not wait indefinitely for the receiver that never responds + exit((duration.count() < 200 && resp.result == 0 && !resp.success) ? 0 : 1); + } else { + // Parent process: Sender creates shared memory but receiver never responds + InterprocessCommunicationChannel dummy_sender(shmName); + + // Wait for child to complete + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, ReceiverCancellationViaStopToken) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver that gets cancelled while waiting for request + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + InterprocessCommunicationChannel receiver(shmName); + + std::stop_source stop_src; + std::jthread canceller([&stop_src]() { + // Cancel after 100ms + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + stop_src.request_stop(); + }); + + auto start = std::chrono::steady_clock::now(); + TestRequest req = receiver.receive_request(stop_src.get_token()); + auto end = std::chrono::steady_clock::now(); + + auto duration = std::chrono::duration_cast(end - start); + + // Should return default request quickly (within 200ms) + // and not wait indefinitely for a request that never comes + exit((duration.count() < 200 && req.value == 0 && !req.flag) ? 0 : 1); + } else { + // Parent process: Sender creates shared memory but never sends request + InterprocessCommunicationChannel sender(shmName); + + // Wait for child to complete + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, StopTokenDoesNotInterruptNormalOperation) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender with stop token that is never triggered + InterprocessCommunicationChannel sender(shmName); + + std::stop_source stop_src; + + TestRequest req(99, true); + TestResponse resp = sender.send_request(req, stop_src.get_token()); + + // Should complete normally and receive proper response + exit((resp.result == 198 && resp.success) ? 0 : 1); + } else { + // Parent process: Receiver responds normally + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + std::stop_source stop_src; + TestRequest req = receiver.receive_request(stop_src.get_token()); + EXPECT_EQ(req.value, 99); + EXPECT_TRUE(req.flag); + + TestResponse resp(req.value * 2, req.flag); + receiver.send_response(resp); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, MultipleExchangesWithStopToken) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender performs multiple exchanges with stop token + InterprocessCommunicationChannel sender(shmName); + + std::stop_source stop_src; + + for (int i = 0; i < 50; ++i) { + TestRequest req(i, i % 2 == 0); + TestResponse resp = sender.send_request(req, stop_src.get_token()); + + if (resp.result != i * 2 || resp.success != (i % 2 == 0)) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Receiver with stop token + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + std::stop_source stop_src; + + for (int i = 0; i < 50; ++i) { + TestRequest req = receiver.receive_request(stop_src.get_token()); + EXPECT_EQ(req.value, i); + + TestResponse resp(req.value * 2, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, SenderCancellationMidExchange) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender that gets cancelled after some exchanges + InterprocessCommunicationChannel sender(shmName); + + std::stop_source stop_src; + + // Perform a few successful exchanges + for (int i = 0; i < 5; ++i) { + TestRequest req(i, true); + TestResponse resp = sender.send_request(req, stop_src.get_token()); + + if (resp.result != i * 2) { + exit(1); + } + } + + // Now trigger cancellation for next exchange + std::jthread canceller([&stop_src]() { + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + stop_src.request_stop(); + }); + + // This should be cancelled (receiver won't respond in time) + TestRequest req(100, false); + auto start = std::chrono::steady_clock::now(); + TestResponse resp = sender.send_request(req, stop_src.get_token()); + auto end = std::chrono::steady_clock::now(); + + auto duration = std::chrono::duration_cast(end - start); + + // Should return default response due to cancellation + exit((duration.count() < 200 && resp.result == 0) ? 0 : 1); + } else { + // Parent process: Receiver responds to first 5 requests, then delays + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + std::stop_source stop_src; + + // Respond to first 5 requests normally + for (int i = 0; i < 5; ++i) { + TestRequest req = receiver.receive_request(stop_src.get_token()); + TestResponse resp(req.value * 2, req.flag); + receiver.send_response(resp); + } + + // Delay before processing the 6th request (which will be cancelled) + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + // Try to receive next request (might be cancelled) + TestRequest req = receiver.receive_request(stop_src.get_token()); + if (req.value == 100) { + TestResponse resp(req.value * 2, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, ReceiverCancellationMidExchange) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Receiver that gets cancelled after some exchanges + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + InterprocessCommunicationChannel receiver(shmName); + + std::stop_source stop_src; + + // Perform a few successful exchanges + for (int i = 0; i < 5; ++i) { + TestRequest req = receiver.receive_request(stop_src.get_token()); + + if (req.value != i) { + exit(1); + } + + TestResponse resp(req.value * 2, req.flag); + receiver.send_response(resp); + } + + // Now trigger cancellation for next receive + std::jthread canceller([&stop_src]() { + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + stop_src.request_stop(); + }); + + // This should be cancelled (sender will delay) + auto start = std::chrono::steady_clock::now(); + TestRequest req = receiver.receive_request(stop_src.get_token()); + auto end = std::chrono::steady_clock::now(); + + auto duration = std::chrono::duration_cast(end - start); + + // Should return default request due to cancellation + exit((duration.count() < 200 && req.value == 0) ? 0 : 1); + } else { + // Parent process: Sender sends first 5 requests, then delays + InterprocessCommunicationChannel sender(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + std::stop_source stop_src; + + // Send first 5 requests normally + for (int i = 0; i < 5; ++i) { + TestRequest req(i, true); + TestResponse resp = sender.send_request(req, stop_src.get_token()); + + if (resp.result != i * 2) { + // Unexpected response + break; + } + } + + // Delay before sending the 6th request (receiver will be cancelled) + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, ImmediateCancellation) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender with immediately stopped token + InterprocessCommunicationChannel sender(shmName); + + std::stop_source stop_src; + stop_src.request_stop(); // Stop immediately + + TestRequest req(42, true); + auto start = std::chrono::steady_clock::now(); + TestResponse resp = sender.send_request(req, stop_src.get_token()); + auto end = std::chrono::steady_clock::now(); + + auto duration = std::chrono::duration_cast(end - start); + + // Should return immediately with default response + exit((duration.count() < 50 && resp.result == 0 && !resp.success) ? 0 : 1); + } else { + // Parent process: Just creates sender + InterprocessCommunicationChannel dummy_sender(shmName); + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_F(InterprocessCommunicationChannelTest, StopTokenWithHighFrequencyExchanges) { + pid_t pid = fork(); + + if (pid == 0) { + // Child process: Sender with many rapid exchanges using stop token + InterprocessCommunicationChannel sender(shmName); + + std::stop_source stop_src; + + for (int i = 0; i < 100; ++i) { + TestRequest req(i % 10, i % 2 == 0); + TestResponse resp = sender.send_request(req, stop_src.get_token()); + + if (resp.result != (i % 10) * 3) { + exit(1); + } + } + exit(0); + } else { + // Parent process: Receiver with stop token + InterprocessCommunicationChannel receiver(shmName); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + std::stop_source stop_src; + + for (int i = 0; i < 100; ++i) { + TestRequest req = receiver.receive_request(stop_src.get_token()); + + TestResponse resp(req.value * 3, req.flag); + receiver.send_response(resp); + } + + int status; + waitpid(pid, &status, 0); + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +// Main function to run all tests +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/finn_xsi/finn_xsi/xsi_bind.cpp b/finn_xsi/finn_xsi/xsi_bind.cpp index 6530c84358..1edf80b01b 100644 --- a/finn_xsi/finn_xsi/xsi_bind.cpp +++ b/finn_xsi/finn_xsi/xsi_bind.cpp @@ -8,8 +8,11 @@ * @author Thomas B. Preußer ***************************************************************************/ +#include +#include +#include + #include -#include "xsi_finn.hpp" #include #include @@ -31,11 +34,6 @@ namespace { PYBIND11_MODULE(xsi, m) { - py::class_>(m, "Kernel") - .def(py::init()) - .def("hex_in_lower", &Kernel::hex_in_lower) - .def("hex_in_upper", &Kernel::hex_in_upper); - py::class_>(m, "Design") .def(py::init([]( std::shared_ptr const &kernel, @@ -54,7 +52,7 @@ PYBIND11_MODULE(xsi, m) { .def("get_status", &Design::get_status) .def("get_error_info", &Design::get_error_info) .def("num_ports", &Design::num_ports) - .def("getPort", static_cast(&Design::getPort)) + .def("getPort", static_cast(&Design::getPort)) .def("ports", [](Design &d) { auto const e = d.ports(); return py::make_iterator(e.begin(), e.end()); diff --git a/finn_xsi/finn_xsi/xsi_finn.cpp b/finn_xsi/finn_xsi/xsi_finn.cpp deleted file mode 100644 index 19134ac988..0000000000 --- a/finn_xsi/finn_xsi/xsi_finn.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2025, Advanced Micro Devices, Inc. - * All rights reserved. - * - * SPDX-License-Identifier: BSD-3-Clause - * - * @brief FINN XSI++: C++ XSI Binding used by FINN. - * @author Thomas B. Preußer - ***************************************************************************/ - -#include "xsi_finn.hpp" - -#include -#include - - -using namespace xsi; - -//=========================================================================== -// Local Helpers - -namespace { - void* resolve_or_throw(SharedLibrary &lib, char const *const sym) { - auto const res = lib.getsymbol(sym); - if(!res) { - throw std::runtime_error( - std::string("Failed to resolve ") - .append(sym).append(" in ").append(lib.path()) - ); - } - return *res; - } - char XZ10[4] = { '0', '1', 'Z', 'X' }; - char HEX[16] = { - '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' - }; -} - -void Kernel::hex_in_lower() { - for(unsigned i = 2; i < 4; i++) XZ10[i] |= ' '; - for(unsigned i = 10; i < 16; i++) HEX [i] |= ' '; -} -void Kernel::hex_in_upper() { - for(unsigned i = 2; i < 4; i++) XZ10[i] &= ~' '; - for(unsigned i = 10; i < 16; i++) HEX [i] &= ~' '; -} - -//=========================================================================== -// Shared Library Representation - -char const SharedLibrary::library_suffix[] = -#if defined(_WIN32) - ".lib"; -#else - ".so"; -#endif - -#if defined(_WIN32) -namespace { - std::string translate_error_message(DWORD errid) { - std::string msg; - LPTSTR bufptr; - FormatMessage( - FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - nullptr, - errid, - MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - &bufptr, - 0, nullptr - ); - if(bufptr) msg = reinterpret_cast(bufptr); - LocalFree(bufptr); - return msg; - } -} -#endif - -SharedLibrary& SharedLibrary::open(std::string const &path) { - if(_lib) throw std::runtime_error("SharedLibrary still open for " + _path); - _lib = load(path); - _path = path; - return *this; -} - -SharedLibrary::handle_type SharedLibrary::load(std::string const &path) { - if(path.empty()) throw std::domain_error("Empty library path."); - -#if defined(_WIN32) - SetLastError(0); -#ifdef UNICODE - // Use LoadLibraryA explicitly on windows if UNICODE is defined - handle_type const lib = LoadLibraryA(path.c_str()); -#else - handle_type const lib = LoadLibrary(path.c_str()); -#endif - if(!lib) throw std::runtime_error(translate_error_message(GetLastError())); -#else - handle_type const lib = dlopen(path.c_str(), RTLD_LAZY | RTLD_GLOBAL); - if(!lib) throw std::runtime_error(dlerror()); -#endif - return lib; -} - -void SharedLibrary::unload() { - if(_lib) { -#if defined(_WIN32) - FreeLibrary(_lib); -#else - dlclose(_lib); -#endif - } -} - -std::optional SharedLibrary::getsymbol(char const *const name) { - void *sym; -#if defined(_WIN32) - sym = (void*)GetProcAddress(_lib, name); - if(!sym) -#else - dlerror(); // clear error - sym = dlsym(_lib, name); - char const *const err = dlerror(); - if(err) -#endif - return std::nullopt; - return std::make_optional(sym); -} - -//=========================================================================== -// xsi::Kernel - -char const *const Kernel::Xsi::FUNC_NAMES[EXTENT] = { - "xsi_get_value", "xsi_put_value", - "xsi_get_int_port", "xsi_get_str_port", - - "xsi_get_int", "xsi_get_port_number", - - "xsi_trace_all", "xsi_run", "xsi_restart", - "xsi_get_status", "xsi_get_error_info", - - "xsi_close" -}; - -#include -inline Kernel::Xsi::Xsi(SharedLibrary &lib) : _hdl(nullptr) { - // Resolve XSI Functions - for(unsigned i = 0; i < EXTENT; i++) { - _func[i] = resolve_or_throw(lib, FUNC_NAMES[i]); - } -} - -//--------------------------------------------------------------------------- -// Life Cycle -Kernel::Kernel(std::string const &kernel_lib) : _kernel_lib(kernel_lib), _xsi(_kernel_lib) {} - -Kernel::~Kernel() { - if(_design_lib) std::cerr << "Disposing XSI Kernel with open Design." << std::endl; -} - -void Kernel::open(std::string const &design_lib, s_xsi_setup_info const &setup_info) { - _design_lib.open(design_lib); - try { - auto const f = t_fp_xsi_open(resolve_or_throw(_design_lib, "xsi_open")); - xsiHandle const hdl = f(const_cast(&setup_info)); - if(!hdl) throw std::runtime_error("Loading of design failed"); - _xsi.setHandle(hdl); - - // Enumerate Ports - unsigned const port_count = xsi(xsiNumTopPorts); - std::unique_ptr ports { new Port[port_count] }; - for(unsigned i = 0; i < port_count; i++) new(&ports[i]) Port(*this, i); - _port_count = port_count; - _ports = std::move(ports); - } - catch(...) { - _design_lib.close(); - throw; - } -} -void Kernel::close() noexcept { - xsi(); - _xsi.setHandle(nullptr); - _design_lib.close(); - _ports.reset(); - - // Clean up Library State - std::optional const vptr = _kernel_lib.getsymbol("svTypeInfo"); - if(vptr) *((void**)*vptr) = nullptr; -} - -//=========================================================================== -// xsi::Port - -bool Port::hasUnknown() const { - unsigned const n = (width()+31) / 32; - s_xsi_vlog_logicval const *const p = buf(); - for(unsigned i = 0; i < n; i++) { - if(p[i].bVal) return true; - } - return false; -} - -bool Port::isZero() const { - unsigned const n = (width()+31) / 32; - s_xsi_vlog_logicval const *const p = buf(); - for(unsigned i = 0; i < n; i++) { - if(p[i].aVal) return false; - } - return true; -} - -std::string Port::as_binstr() const { - unsigned const w = width(); - std::string res(w, '?'); - - s_xsi_vlog_logicval const *si = buf(); - std::string::iterator di = res.end(); -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" - uint32_t a; - uint32_t b; - for(unsigned i = 0; i < w; i++) { - if((i & 31) == 0) { - a = si->aVal; - b = si->bVal; - si++; - } - *--di = XZ10[((b&1)<<1)|(a&1)]; - a >>= 1; - b >>= 1; - } -#pragma GCC diagnostic pop - return res; -} - -std::string Port::as_hexstr() const { - unsigned l = (width()+3)/4; - std::string res(l, '?'); - s_xsi_vlog_logicval const *si = buf(); - std::string::iterator di = res.end(); - - while(l > 0) { - uint32_t a = si->aVal; - uint32_t b = si->bVal; - si++; - - unsigned m = std::min(8u, l); - l -= m; - do { - unsigned const bm = b & 0xF; - unsigned const am = a & 0xF; - - *--di = !bm? HEX[am] : XZ10[3 - !(am&bm)]; - a >>= 4; - b >>= 4; - } - while(--m > 0); - } - return res; -} - -Port& Port::clear() { - unsigned const n = (width()+31) / 32; - s_xsi_vlog_logicval *const p = buf(); - std::fill(p, p+n, s_xsi_vlog_logicval { .aVal = 0u, .bVal = 0u }); - return *this; -} - -Port& Port::set_binstr(std::string const &val) { - std::string::const_iterator si = val.end(); - s_xsi_vlog_logicval *di = buf(); - - unsigned const n = (width()+31) / 32; - unsigned l = val.length(); - for(unsigned i = 0; i < n; i++) { - uint32_t a = 0; - uint32_t b = 0; - - unsigned const m = std::min(32u, l); - l -= m; - si -= m; - for(unsigned j = 0; j < m; j++) { - a <<= 1; - b <<= 1; - switch(*si++) { - case '1': - a |= 1; - case '0': - continue; - - default: - a |= 1; - case 'Z': - case 'z': - b |= 1; - continue; - } - } - si -= m; - - di->aVal = a; - di->bVal = b; - di++; - } - - return *this; -} - -Port& Port::set_hexstr(std::string const &val) { - std::string::const_iterator si = val.end(); - s_xsi_vlog_logicval *di = buf(); - - unsigned const n = (width()+31) / 32; - unsigned l = val.length(); - for(unsigned i = 0; i < n; i++) { - uint32_t a = 0; - uint32_t b = 0; - - unsigned const m = std::min(8u, l); - l -= m; - si -= m; - for(unsigned j = 0; j < m; j++) { - char c = *si++; - a <<= 4; - b <<= 4; - - if(('0' <= c) && c <= '9') a |= c & 0xF; - else { - c |= 0x20; - if(('a' <= c) && (c <= 'f')) a |= c - ('a'-10); - else { - b |= 0xF; - if(c != 'z') a |= 0xF; - } - } - } - si -= m; - - di->aVal = a; - di->bVal = b; - di++; - } - - return *this; -} diff --git a/finn_xsi/finn_xsi/xsi_finn.hpp b/finn_xsi/finn_xsi/xsi_finn.hpp deleted file mode 100644 index 4268657aef..0000000000 --- a/finn_xsi/finn_xsi/xsi_finn.hpp +++ /dev/null @@ -1,356 +0,0 @@ -/**************************************************************************** - * Copyright (C) 2025, Advanced Micro Devices, Inc. - * All rights reserved. - * - * SPDX-License-Identifier: BSD-3-Clause - * - * @brief FINN XSI++: C++ XSI Binding used by FINN. - * @author Thomas B. Preußer - ***************************************************************************/ -#ifndef XSI_FINN_HPP -#define XSI_FINN_HPP - -#include -#include -#include -#include -#include - -#include -#include - -#if defined(_WIN32) -# include -#else -# include -#endif - -#include "xsi.h" - - -namespace xsi { - -//=========================================================================== -// Shared Library Representation - -class SharedLibrary { -public: - static char const library_suffix[]; - -private: - using handle_type = -#if defined(_WIN32) - HINSTANCE; -#else - void*; -#endif - - //----------------------------------------------------------------------- - // Instance State -private: - handle_type _lib; - std::string _path; - - //----------------------------------------------------------------------- - // Life Cycle -public: - SharedLibrary() : _lib(nullptr), _path() {} - SharedLibrary(std::string const &path) : _lib(load(path)), _path(path) {} - ~SharedLibrary() { unload(); } - -private: - SharedLibrary(SharedLibrary const&) = delete; - SharedLibrary& operator=(SharedLibrary const&) = delete; - -public: - operator bool() const { return bool(_lib); } - SharedLibrary& open(std::string const &path); - SharedLibrary& close() { - unload(); - _lib = nullptr; - _path.clear(); - return *this; - } - -private: - static handle_type load(std::string const &path); - void unload(); - - //----------------------------------------------------------------------- - // Accessors -public: - std::string const& path() const { return _path; } - std::optional getsymbol(char const *const name); - -}; // class SharedLibrary - -//=========================================================================== -// xsi::Kernel - -template -class enumerator { - It _begin; - It _end; -public: - enumerator(It begin, It end) : _begin(begin), _end(end) {} - ~enumerator() {} -public: - It begin() const { return _begin; } - It end() const { return _end; } -}; - -class Design; -class Port; -class Kernel { - - //----------------------------------------------------------------------- - // Dispatch Table for XSI Functions - class Xsi { - //- Statics --------------------- - public: - // Function Indeces - static constexpr unsigned - get_value = 0, put_value = 1, - get_int_port = 2, get_str_port = 3, - - get_int = 4, get_port_number = 5, - - trace_all = 6, run = 7, restart = 8, - get_status = 9, get_error_info = 10, - - close = 11; - - private: - // Function Names & Types - static constexpr unsigned EXTENT = 12; - static char const *const FUNC_NAMES[EXTENT]; - using type_map = std::tuple< - // Port Access - t_fp_xsi_get_value, t_fp_xsi_put_value, - t_fp_xsi_get_int_port, t_fp_xsi_get_str_port, - - // Design Inspection - t_fp_xsi_get_int, t_fp_xsi_get_port_number, - - // Simulation Control & Status - t_fp_xsi_trace_all, t_fp_xsi_run, t_fp_xsi_restart, - t_fp_xsi_get_status, t_fp_xsi_get_error_info, - - // Closing - t_fp_xsi_close - >; - - //- Actual Contents ------------- - private: - xsiHandle _hdl; - void* _func[EXTENT]; - - //- Lifecycle: in-place structure inside Kernel only - public: - Xsi(SharedLibrary &lib); - ~Xsi() {} - private: - Xsi(Xsi const&) = delete; - Xsi& operator=(Xsi const&) = delete; - - //- Handle Update --------------- - public: - void setHandle(xsiHandle hdl) { _hdl = hdl; } - - //- XSI Function Invocation ----- - public: - template - auto invoke(Args&&... args) const { - auto const f = decltype(std::get(type_map()))(_func[FID]); - return (*f)(_hdl, std::forward(args)...); - } - - }; // class Xsi - -private: - // Instance State - SharedLibrary _kernel_lib; // Backing Kernel Library - Xsi _xsi; // XSI Dispatch Table - - // Optional State once a Design in open - SharedLibrary _design_lib; - unsigned _port_count; - std::unique_ptr _ports; - -public: - Kernel(std::string const &kernel_lib); - Kernel(Kernel const&) = delete; - Kernel& operator=(Kernel const&) = delete; - ~Kernel(); - - // Interface reserved for forwarded access through open Design -private: - friend Design; - friend Port; - template - auto xsi(Args&&... args) const { - return _xsi.invoke(std::forward(args)...); - } - - // Port Accessors inlined below and public through Design - Port* getPort(char const *const name); - Port const* getPort(char const *const name) const; - enumerator ports(); - enumerator ports() const; - - // Design con- & destruction hooks - void open(std::string const &design_lib, s_xsi_setup_info const &setup_info); - void close() noexcept; - -public: - // Hex printing manipulation - static void hex_in_lower(); - static void hex_in_upper(); - -}; // class Kernel - -//=========================================================================== -// xsi::Design - -// - non-copyable, non-movable handle for exposing simulation control. -class Design { - using Xsi = Kernel::Xsi; - Kernel &_kernel; - -public: - Design( - Kernel &kernel, - std::string const &design_lib, - s_xsi_setup_info const &setup_info - ) : _kernel(kernel) { kernel.open(design_lib, setup_info); } - Design( - Kernel &kernel, std::string const &design_lib, - char const *const log_file = nullptr, - char const *const wdb_file = nullptr - ) : Design(kernel, design_lib, s_xsi_setup_info { - .logFileName = const_cast(log_file), - .wdbFileName = const_cast(wdb_file) - }) {} - ~Design() { _kernel.close(); } - -private: - Design(Design const&) = delete; - Design& operator*(Design const&) = delete; - - //----------------------------------------------------------------------- - // Forwarded Access to Open Simulation - - // Simulation Control & Status -public: - void trace_all() { _kernel.xsi(); } - void run(XSI_INT64 const step) { _kernel.xsi(step); } - void restart() { _kernel.xsi(); } - - int get_status() const { return _kernel.xsi(); } - char const* get_error_info() const { return _kernel.xsi(); } - - // Port Access -public: - int num_ports() const { return _kernel._port_count; } - - Port* getPort(std::string const &name) { return _kernel.getPort(name.c_str()); } - Port const* getPort(std::string const &name) const { return _kernel.getPort(name.c_str()); } - - enumerator ports() { return _kernel.ports(); } - enumerator ports() const { return const_cast(_kernel).ports(); } - -}; // class Design - -//=========================================================================== -// xsi::Port - -// Only exists within controlled environment within Kernel with open Design. -class Port { - using Xsi = Kernel::Xsi; - Kernel &_kernel; - unsigned const _id; - std::unique_ptr const _buf; - -private: - // Con- and destruction under full control of Kernel - friend class Kernel; - Port() : _kernel(*static_cast(nullptr)), _id(0), _buf() {} - Port(Kernel &kernel, unsigned const id) - : _kernel(kernel), _id(id), - _buf(std::make_unique((width()+31)/32)) {} - Port(Port const&) = delete; - Port& operator=(Port const&) = delete; -public: - ~Port() {} - -public: - char const* name() const { return _kernel.xsi(_id, xsiNameTopPort); } - int dir() const { return _kernel.xsi(_id, xsiDirectionTopPort); } - unsigned width() const { return _kernel.xsi(_id, xsiHDLValueSize); } - - bool isInput() const { return dir() == xsiInputPort; } - bool isOutput() const { return dir() == xsiOutputPort; } - bool isInout() const { return dir() == xsiInoutPort; } - -private: - s_xsi_vlog_logicval* buf() { return _buf.get(); } - s_xsi_vlog_logicval const* buf() const { return _buf.get(); } - -public: - // Buffer Synchronization - Port& read() { - _kernel.xsi(_id, buf()); - return *this; - } - void write_back() { - _kernel.xsi(_id, buf()); - } - - // Inspection - bool hasUnknown() const; - bool isZero() const; - bool operator[](unsigned const idx) const { - return (buf()[idx/32].aVal >> (idx%32)) & 1; - } - - bool as_bool() const { return buf()->aVal & 1; } - unsigned as_unsigned() const { return buf()->aVal; } - std::string as_binstr() const; - std::string as_hexstr() const; - - // Manipulation - Port& clear(); - Port& set(unsigned val) { - s_xsi_vlog_logicval *const p = buf(); - p->aVal = val; - p->bVal = 0; - return *this; - } - Port& set_binstr(std::string const &val); - Port& set_hexstr(std::string const &val); - -}; // class Port - -// Inlined Kernel Port Accessors - -inline Port* Kernel::getPort(char const *const name) { - int const id = xsi(name); - return (id == -1)? nullptr : &_ports[id]; -} -inline Port const* Kernel::getPort(char const *const name) const { - int const id = xsi(name); - return (id == -1)? nullptr : &_ports[id]; -} - -inline enumerator Kernel::ports() { - Port *const beg = _ports.get(); - return { beg, beg + _port_count }; -} -inline enumerator Kernel::ports() const { - Port const *const beg = _ports.get(); - return { beg, beg + _port_count }; -} - -} // namespace xsi - -#endif diff --git a/finn_xsi/testcase/StreamingEltwise_hls_0.v b/finn_xsi/testcase/StreamingEltwise_hls_0.v deleted file mode 100644 index f5207e0548..0000000000 --- a/finn_xsi/testcase/StreamingEltwise_hls_0.v +++ /dev/null @@ -1,349 +0,0 @@ -// ============================================================== -// Generated by Vitis HLS v2024.2 -// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. -// Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// ============================================================== - -`timescale 1 ns / 1 ps - -(* CORE_GENERATION_INFO="StreamingEltwise_hls_0_StreamingEltwise_hls_0,hls_ip_2024_2,{HLS_INPUT_TYPE=cxx,HLS_INPUT_FLOAT=0,HLS_INPUT_FIXED=0,HLS_INPUT_PART=xc7z020-clg400-1,HLS_INPUT_CLOCK=5.000000,HLS_INPUT_ARCH=others,HLS_SYN_CLOCK=4.826000,HLS_SYN_LAT=10,HLS_SYN_TPT=none,HLS_SYN_MEM=0,HLS_SYN_DSP=0,HLS_SYN_FF=7,HLS_SYN_LUT=101,HLS_VERSION=2024_2}" *) - -module StreamingEltwise_hls_0 ( - ap_clk, - ap_rst_n, - in0_V_TVALID, - in1_V_TVALID, - out_V_TREADY, - in0_V_TDATA, - in0_V_TREADY, - in1_V_TDATA, - in1_V_TREADY, - out_V_TDATA, - out_V_TVALID -); - -parameter ap_ST_iter0_fsm_state1 = 1'd1; -parameter ap_ST_iter1_fsm_state2 = 2'd2; -parameter ap_ST_iter1_fsm_state0 = 2'd1; - -input ap_clk; -input ap_rst_n; -input in0_V_TVALID; -input in1_V_TVALID; -input out_V_TREADY; -input [7:0] in0_V_TDATA; -output in0_V_TREADY; -input [7:0] in1_V_TDATA; -output in1_V_TREADY; -output [15:0] out_V_TDATA; -output out_V_TVALID; - - reg ap_rst_n_inv; -reg [0:0] ap_CS_iter0_fsm; -wire ap_CS_iter0_fsm_state1; -reg ap_block_state1_pp0_stage0_iter0; -reg [1:0] ap_CS_iter1_fsm; -wire regslice_both_out_V_U_apdone_blk; -reg ap_block_state2_pp0_stage0_iter1; -wire ap_CS_iter1_fsm_state2; -wire [0:0] icmp_ln82_fu_110_p2; -reg ap_condition_exit_pp0_iter0_stage0; -reg ap_ready_int; -reg in0_V_TDATA_blk_n; -reg in1_V_TDATA_blk_n; -reg out_V_TDATA_blk_n; -reg [0:0] icmp_ln82_reg_133; -wire [0:0] icmp_ln82_reg_133_pp0_iter0_reg; -reg [2:0] i1_fu_50; -wire [2:0] i_fu_104_p2; -wire ap_loop_init; -reg [2:0] ap_sig_allocacmp_i1_load; -wire [3:0] in0_slice_channels_fu_81_p1; -wire [8:0] zext_ln20_fu_85_p1; -wire [8:0] zext_ln20_1_fu_89_p1; -wire [8:0] outElem_fu_93_p2; -reg [0:0] ap_NS_iter0_fsm; -reg [1:0] ap_NS_iter1_fsm; -reg ap_ST_iter0_fsm_state1_blk; -reg ap_ST_iter1_fsm_state2_blk; -wire ap_start_int; -wire ap_ready_sig; -wire ap_done_sig; -wire ap_continue_int; -wire regslice_both_in0_V_U_apdone_blk; -wire [7:0] in0_V_TDATA_int_regslice; -wire in0_V_TVALID_int_regslice; -reg in0_V_TREADY_int_regslice; -wire regslice_both_in0_V_U_ack_in; -wire regslice_both_in1_V_U_apdone_blk; -wire [7:0] in1_V_TDATA_int_regslice; -wire in1_V_TVALID_int_regslice; -reg in1_V_TREADY_int_regslice; -wire regslice_both_in1_V_U_ack_in; -wire [15:0] out_V_TDATA_int_regslice; -reg out_V_TVALID_int_regslice; -wire out_V_TREADY_int_regslice; -wire regslice_both_out_V_U_vld_out; -reg ap_condition_50; -wire ap_ce_reg; - -// power-on initialization -initial begin -#0 ap_CS_iter0_fsm = 1'd1; -#0 ap_CS_iter1_fsm = 2'd1; -#0 i1_fu_50 = 3'd0; -end - -StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont flow_control_loop_pipe_no_ap_cont_U( - .ap_clk(ap_clk), - .ap_rst(ap_rst_n_inv), - .ap_start(1'b1), - .ap_ready(ap_ready_sig), - .ap_done(ap_done_sig), - .ap_start_int(ap_start_int), - .ap_loop_init(ap_loop_init), - .ap_ready_int(ap_ready_int), - .ap_loop_exit_ready(ap_condition_exit_pp0_iter0_stage0), - .ap_loop_exit_done(1'b0), - .ap_continue_int(ap_continue_int), - .ap_done_int(1'b0) -); - -StreamingEltwise_hls_0_regslice_both #( - .DataWidth( 8 )) -regslice_both_in0_V_U( - .ap_clk(ap_clk), - .ap_rst(ap_rst_n_inv), - .data_in(in0_V_TDATA), - .vld_in(in0_V_TVALID), - .ack_in(regslice_both_in0_V_U_ack_in), - .data_out(in0_V_TDATA_int_regslice), - .vld_out(in0_V_TVALID_int_regslice), - .ack_out(in0_V_TREADY_int_regslice), - .apdone_blk(regslice_both_in0_V_U_apdone_blk) -); - -StreamingEltwise_hls_0_regslice_both #( - .DataWidth( 8 )) -regslice_both_in1_V_U( - .ap_clk(ap_clk), - .ap_rst(ap_rst_n_inv), - .data_in(in1_V_TDATA), - .vld_in(in1_V_TVALID), - .ack_in(regslice_both_in1_V_U_ack_in), - .data_out(in1_V_TDATA_int_regslice), - .vld_out(in1_V_TVALID_int_regslice), - .ack_out(in1_V_TREADY_int_regslice), - .apdone_blk(regslice_both_in1_V_U_apdone_blk) -); - -StreamingEltwise_hls_0_regslice_both #( - .DataWidth( 16 )) -regslice_both_out_V_U( - .ap_clk(ap_clk), - .ap_rst(ap_rst_n_inv), - .data_in(out_V_TDATA_int_regslice), - .vld_in(out_V_TVALID_int_regslice), - .ack_in(out_V_TREADY_int_regslice), - .data_out(out_V_TDATA), - .vld_out(regslice_both_out_V_U_vld_out), - .ack_out(out_V_TREADY), - .apdone_blk(regslice_both_out_V_U_apdone_blk) -); - -always @ (posedge ap_clk) begin - if (ap_rst_n_inv == 1'b1) begin - ap_CS_iter0_fsm <= ap_ST_iter0_fsm_state1; - end else begin - ap_CS_iter0_fsm <= ap_NS_iter0_fsm; - end -end - -always @ (posedge ap_clk) begin - if (ap_rst_n_inv == 1'b1) begin - ap_CS_iter1_fsm <= ap_ST_iter1_fsm_state0; - end else begin - ap_CS_iter1_fsm <= ap_NS_iter1_fsm; - end -end - -always @ (posedge ap_clk) begin - if ((1'b1 == ap_condition_50)) begin - i1_fu_50 <= i_fu_104_p2; - end -end - -always @ (posedge ap_clk) begin - if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin - icmp_ln82_reg_133 <= icmp_ln82_fu_110_p2; - end -end - -always @ (*) begin - if ((1'b1 == ap_block_state1_pp0_stage0_iter0)) begin - ap_ST_iter0_fsm_state1_blk = 1'b1; - end else begin - ap_ST_iter0_fsm_state1_blk = 1'b0; - end -end - -always @ (*) begin - if ((1'b1 == ap_block_state2_pp0_stage0_iter1)) begin - ap_ST_iter1_fsm_state2_blk = 1'b1; - end else begin - ap_ST_iter1_fsm_state2_blk = 1'b0; - end -end - -always @ (*) begin - if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (icmp_ln82_fu_110_p2 == 1'd1) & (1'b1 == ap_CS_iter0_fsm_state1))) begin - ap_condition_exit_pp0_iter0_stage0 = 1'b1; - end else begin - ap_condition_exit_pp0_iter0_stage0 = 1'b0; - end -end - -always @ (*) begin - if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin - ap_ready_int = 1'b1; - end else begin - ap_ready_int = 1'b0; - end -end - -always @ (*) begin - if (((ap_loop_init == 1'b1) & (1'b1 == ap_CS_iter0_fsm_state1))) begin - ap_sig_allocacmp_i1_load = 3'd0; - end else begin - ap_sig_allocacmp_i1_load = i1_fu_50; - end -end - -always @ (*) begin - if ((1'b1 == ap_CS_iter0_fsm_state1)) begin - in0_V_TDATA_blk_n = in0_V_TVALID_int_regslice; - end else begin - in0_V_TDATA_blk_n = 1'b1; - end -end - -always @ (*) begin - if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin - in0_V_TREADY_int_regslice = 1'b1; - end else begin - in0_V_TREADY_int_regslice = 1'b0; - end -end - -always @ (*) begin - if ((1'b1 == ap_CS_iter0_fsm_state1)) begin - in1_V_TDATA_blk_n = in1_V_TVALID_int_regslice; - end else begin - in1_V_TDATA_blk_n = 1'b1; - end -end - -always @ (*) begin - if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin - in1_V_TREADY_int_regslice = 1'b1; - end else begin - in1_V_TREADY_int_regslice = 1'b0; - end -end - -always @ (*) begin - if (((1'b1 == ap_CS_iter1_fsm_state2) | (1'b1 == ap_CS_iter0_fsm_state1))) begin - out_V_TDATA_blk_n = out_V_TREADY_int_regslice; - end else begin - out_V_TDATA_blk_n = 1'b1; - end -end - -always @ (*) begin - if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin - out_V_TVALID_int_regslice = 1'b1; - end else begin - out_V_TVALID_int_regslice = 1'b0; - end -end - -always @ (*) begin - case (ap_CS_iter0_fsm) - ap_ST_iter0_fsm_state1 : begin - ap_NS_iter0_fsm = ap_ST_iter0_fsm_state1; - end - default : begin - ap_NS_iter0_fsm = 'bx; - end - endcase -end - -always @ (*) begin - case (ap_CS_iter1_fsm) - ap_ST_iter1_fsm_state2 : begin - if (((1'b1 == ap_CS_iter0_fsm_state1) & (1'b0 == ap_block_state2_pp0_stage0_iter1) & (1'b0 == ap_block_state1_pp0_stage0_iter0))) begin - ap_NS_iter1_fsm = ap_ST_iter1_fsm_state2; - end else if (((1'b0 == ap_block_state2_pp0_stage0_iter1) & ((1'b0 == ap_CS_iter0_fsm_state1) | ((1'b1 == ap_CS_iter0_fsm_state1) & (1'b1 == ap_block_state1_pp0_stage0_iter0))))) begin - ap_NS_iter1_fsm = ap_ST_iter1_fsm_state0; - end else if (((icmp_ln82_reg_133_pp0_iter0_reg == 1'd1) & (1'b1 == ap_CS_iter1_fsm_state2) & (1'b0 == ap_block_state2_pp0_stage0_iter1))) begin - ap_NS_iter1_fsm = ap_ST_iter0_fsm_state1; - end else begin - ap_NS_iter1_fsm = ap_ST_iter1_fsm_state2; - end - end - ap_ST_iter1_fsm_state0 : begin - if ((~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1))) begin - ap_NS_iter1_fsm = ap_ST_iter1_fsm_state2; - end else begin - ap_NS_iter1_fsm = ap_ST_iter1_fsm_state0; - end - end - default : begin - ap_NS_iter1_fsm = 'bx; - end - endcase -end - -assign ap_CS_iter0_fsm_state1 = ap_CS_iter0_fsm[32'd0]; - -assign ap_CS_iter1_fsm_state2 = ap_CS_iter1_fsm[32'd1]; - -always @ (*) begin - ap_block_state1_pp0_stage0_iter0 = ((out_V_TREADY_int_regslice == 1'b0) | (in1_V_TVALID_int_regslice == 1'b0) | (in0_V_TVALID_int_regslice == 1'b0)); -end - -always @ (*) begin - ap_block_state2_pp0_stage0_iter1 = ((regslice_both_out_V_U_apdone_blk == 1'b1) | (out_V_TREADY_int_regslice == 1'b0)); -end - -always @ (*) begin - ap_condition_50 = (~((1'b1 == ap_block_state1_pp0_stage0_iter0) | ((1'b1 == ap_CS_iter1_fsm_state2) & (1'b1 == ap_block_state2_pp0_stage0_iter1))) & (1'b1 == ap_CS_iter0_fsm_state1)); -end - -always @ (*) begin - ap_rst_n_inv = ~ap_rst_n; -end - -assign i_fu_104_p2 = (ap_sig_allocacmp_i1_load + 3'd1); - -assign icmp_ln82_fu_110_p2 = ((ap_sig_allocacmp_i1_load == 3'd7) ? 1'b1 : 1'b0); - -assign icmp_ln82_reg_133_pp0_iter0_reg = icmp_ln82_reg_133; - -assign in0_V_TREADY = regslice_both_in0_V_U_ack_in; - -assign in0_slice_channels_fu_81_p1 = in0_V_TDATA_int_regslice[3:0]; - -assign in1_V_TREADY = regslice_both_in1_V_U_ack_in; - -assign outElem_fu_93_p2 = (zext_ln20_fu_85_p1 - zext_ln20_1_fu_89_p1); - -assign out_V_TDATA_int_regslice = outElem_fu_93_p2; - -assign out_V_TVALID = regslice_both_out_V_U_vld_out; - -assign zext_ln20_1_fu_89_p1 = in1_V_TDATA_int_regslice; - -assign zext_ln20_fu_85_p1 = in0_slice_channels_fu_81_p1; - -endmodule //StreamingEltwise_hls_0 diff --git a/finn_xsi/testcase/StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont.v b/finn_xsi/testcase/StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont.v deleted file mode 100644 index e3ff4d1e48..0000000000 --- a/finn_xsi/testcase/StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont.v +++ /dev/null @@ -1,103 +0,0 @@ -// ============================================================== -// Vitis HLS - High-Level Synthesis from C, C++ and OpenCL v2024.2 (64-bit) -// Tool Version Limit: 2024.11 -// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. -// Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// -// ============================================================== - -`timescale 1 ns / 1 ps - -module StreamingEltwise_hls_0_flow_control_loop_pipe_no_ap_cont( - ap_clk, - ap_rst, - ap_start, - ap_ready, - ap_done, - ap_start_int, - ap_ready_int, - ap_done_int, - ap_continue_int, - ap_loop_init, - ap_loop_exit_ready, - ap_loop_exit_done -); - -input ap_clk; -input ap_rst; - -//Block level handshake with outside loop -input ap_start; -output ap_ready; -output ap_done; - -//Block level handshake with loop body -output ap_start_int; -input ap_ready_int; -input ap_done_int; -output ap_continue_int; - -//Init live in variables -output ap_loop_init; -reg ap_loop_init; -reg ap_done; -reg ap_done_cache; - -//Exit signal from loop body -input ap_loop_exit_ready; -input ap_loop_exit_done; - -// power-on initialization -initial begin -#0 ap_loop_init = 1'b1; -#0 ap_done_cache = 1'b0; -end - -assign ap_start_int = ap_start; - -assign ap_continue_int = 1'b1; - -assign ap_ready = ap_loop_exit_ready; - -//ap_loop_init is valid for the first II -//of the first loop run so as to enable -//the init block ops which are pushed into -//the first state of the pipeline region -always @ (posedge ap_clk) -begin - if (ap_rst == 1'b1) begin - ap_loop_init <= 1'b1; - end else if(ap_loop_exit_ready == 1'b1) begin - ap_loop_init <= 1'b1; - end else if(ap_ready_int == 1'b1) begin - ap_loop_init <= 1'b0; - end -end - -// if no ap_continue port and current module is not top module, -// ap_done handshakes with ap_start. Internally, flow control sends out -// ap_conintue_int = 1'b1 so the ap_done_int is asserted high for 1 clock cycle. -// ap_done_cache is used to record ap_done_int, and de-assert if ap_start_int -// is asserted, so DUT can start the next run -always @(posedge ap_clk) -begin - if (ap_rst == 1'b1) begin - ap_done_cache <= 1'b0; - end else if (ap_done_int == 1'b1) begin - ap_done_cache <= 1'b1; - end else if (ap_start_int == 1'b1) begin - ap_done_cache <= 1'b0; - end -end - -// if no ap_continue port and current module is not top module, ap_done handshakes with ap_start -always @(*) -begin - if ((ap_done_int == 1'b1) || ((ap_done_cache == 1'b1) && (ap_start_int == 1'b0))) begin - ap_done = 1'b1; - end else begin - ap_done = 1'b0; - end -end - -endmodule diff --git a/finn_xsi/testcase/StreamingEltwise_hls_0_regslice_both.v b/finn_xsi/testcase/StreamingEltwise_hls_0_regslice_both.v deleted file mode 100644 index c2e16007cc..0000000000 --- a/finn_xsi/testcase/StreamingEltwise_hls_0_regslice_both.v +++ /dev/null @@ -1,110 +0,0 @@ -// ============================================================== -// Generated by Vitis HLS v2024.2 -// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. -// Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. -// ============================================================== -`timescale 1ns/1ps - -module StreamingEltwise_hls_0_regslice_both -#(parameter - DataWidth = 8 -) ( - // system signals - input wire ap_clk, - input wire ap_rst, - // slave side - input wire [DataWidth-1:0] data_in, - input wire vld_in, - output wire ack_in, - // master side - output wire [DataWidth-1:0] data_out, - output wire vld_out, - input wire ack_out, - output wire apdone_blk); - //------------------------Parameter---------------------- - // state - localparam [1:0] - ZERO = 2'b10, - ONE = 2'b11, - TWO = 2'b01; - //------------------------Local signal------------------- - reg [DataWidth-1:0] data_p1 = {DataWidth{1'b0}}; - reg [DataWidth-1:0] data_p2 = {DataWidth{1'b0}}; - wire load_p1; - wire load_p2; - wire load_p1_from_p2; - reg ack_in_t = 1'b0; - reg [1:0] state = 2'b00; - reg [1:0] next; - //------------------------Body--------------------------- - assign ack_in = ack_in_t; - assign data_out = data_p1; - assign vld_out = state[0]; - assign apdone_blk = (state == ONE && ~ack_out) || (state == TWO); - - assign load_p1 = (state == ZERO && vld_in) || - (state == ONE && vld_in && ack_out) || - (state == TWO && ack_out); - assign load_p2 = vld_in & ack_in; - assign load_p1_from_p2 = (state == TWO); - - // data_p1 - always @(posedge ap_clk) begin - if (load_p1) begin - if (load_p1_from_p2) - data_p1 <= data_p2; - else - data_p1 <= data_in; - end - end - - // data_p2 - always @(posedge ap_clk) begin - if (load_p2) data_p2 <= data_in; - end - - // ack_in_t - always @(posedge ap_clk) begin - if (ap_rst) - ack_in_t <= 1'b0; - else if (state == ZERO) - ack_in_t <= 1'b1; - else if (state == ONE && next == TWO) - ack_in_t <= 1'b0; - else if (state == TWO && next == ONE) - ack_in_t <= 1'b1; - end - - // state - always @(posedge ap_clk) begin - if (ap_rst) - state <= ZERO; - else - state <= next; - end - - // next - always @(*) begin - case (state) - ZERO: - if (vld_in & ack_in) - next = ONE; - else - next = ZERO; - ONE: - if (~vld_in & ack_out) - next = ZERO; - else if (vld_in & ~ack_out) - next = TWO; - else - next = ONE; - TWO: - if (ack_out) - next = ONE; - else - next = TWO; - default: - next = ZERO; - endcase - end -endmodule diff --git a/pyproject.toml b/pyproject.toml index 6577b95938..4f03a52311 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -183,5 +183,6 @@ ignore = [ "ANN401", # Don't use the "Any" type "D413", # Blank lines at docstring end "D205", # Blank line after summary (enables multiline summaries) - "N801", # Class names should use CapWords convention + "N801", # Class name should use CapWords convention + "D209", # Multi-line docstring closing quotes should be on a separate line ] diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 3e596add9d..499fa721cb 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -102,9 +102,12 @@ def get_default_session_options_new(): is_followup = True save_dir = save_dir + "_followup" else: - config_path = os.path.join("ci", "cfg", config_name + ".yml") + if config_name.endswith(".yaml") or config_name.endswith(".yml"): + config_path = config_name + else: + config_path = os.path.join("ci", "cfg", config_name + ".yml") print("Job launched with SLURM ID: %d" % (job_id)) - except KeyError: + except KeyError as e: # Launched without SLURM, assume test run on local machine job_id = 0 experiment_dir = "bench_output/" + time.strftime("%d_%H_%M") diff --git a/src/finn/benchmarking/dut/resnet18.yml b/src/finn/benchmarking/dut/resnet18.yml index f427c33e83..fb8a6589fe 100644 --- a/src/finn/benchmarking/dut/resnet18.yml +++ b/src/finn/benchmarking/dut/resnet18.yml @@ -1,3 +1,7 @@ +model_path: models/resnet18/resnet18_w3a3_cifar100.onnx +folding_config_file: models/resnet18/resnet18_folding_config.json +specialize_layers_config_file: models/resnet18/resnet18_specialize_layers.json + steps: - step_qonnx_to_finn - step_tidy_up @@ -11,13 +15,13 @@ steps: - step_apply_folding_config - step_minimize_bit_width - step_generate_estimate_reports - - step_build_simulation - - step_size_fifo_connected - - step_apply_fifosizes - - step_generate_estimate_reports + - step_set_fifo_depths - step_hw_codegen - step_hw_ipgen - step_create_stitched_ip - step_synthesize_bitfile - step_make_driver - step_deployment_package + +# Required to use RTL MVAUs +standalone_thresholds: true diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 3f4caa26ea..2114706642 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -52,6 +52,13 @@ from typing import Any, TextIO import finn.util.logging +from finn.util.exception import ( + FINNConfigurationError, + FINNDataflowError, + FINNError, + FINNUserError, +) +from finn.util.exception_snapshot import snapshot_on_exception from finn.builder.build_dataflow_config import ( DataflowBuildConfig, LogLevel, @@ -60,8 +67,6 @@ ) from finn.builder.build_dataflow_steps import build_dataflow_step_lookup from finn.util.basic import get_vivado_root -from finn.util.exception import FINNConfigurationError, FINNDataflowError, FINNError, FINNUserError -from finn.util.exception_snapshot import snapshot_on_exception from finn.util.logging import log from finn.util.settings import get_settings @@ -345,7 +350,7 @@ def create_model_wrapper(model_filename: str, cfg: DataflowBuildConfig) -> Model f"Building dataflow accelerator from intermediate" f" checkpoint {intermediate_model_filename}" ) - return ModelWrapper(intermediate_model_filename) + return ModelWrapper(str(intermediate_model_filename)) def build_dataflow_cfg(model_filename: str, cfg: DataflowBuildConfig) -> int: diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 32f28add90..75d4aa3468 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -97,6 +97,7 @@ class AutoFIFOSizingMethod(str, Enum): CHARACTERIZE = "characterize" LARGEFIFO_RTLSIM = "largefifo_rtlsim" + DISTRIBUTED_SIMULATION = "distributed_sim" class ShellFlowType(str, Enum): @@ -455,6 +456,10 @@ def _fix_path(p: Path | None) -> Path | None: #: Enables experimental live FIFO sizing on the FPGA. live_fifo_sizing: bool = False + #: Whether to use functional simulation when available. Takes some time + #: to synthesize, but results in much faster simulations. + functional_simulation: bool = True + #: Whether FIFO nodes with depth larger than 32768 will be split. #: Allow to configure very large FIFOs in the folding_config_file. split_large_fifos: bool = False @@ -638,6 +643,10 @@ def _resolve_fpga_part(self) -> str: """ if self.fpga_part is None: # lookup from part map if not specified + if self.board is None: + raise FINNConfigurationError( + "Either board or fpga_part must be specified in flow config." + ) try: fpga_part = part_map[self.board] return fpga_part @@ -715,11 +724,11 @@ def _resolve_verification_io_pair(self) -> None | tuple[Any, Any]: if self.verify_steps is None: return None if not Path(self.verify_input_npy).is_file(): - raise FINNConfigurationError("verify_input_npy not found: " + self.verify_input_npy) + raise FINNConfigurationError("verify_input_npy not found: " + str(self.verify_input_npy)) verify_input_npy = np.load(self.verify_input_npy) if not Path(self.verify_expected_output_npy).is_file(): raise FINNConfigurationError( - "verify_expected_output_npy not found: " + self.verify_expected_output_npy + "verify_expected_output_npy not found: " + str(self.verify_expected_output_npy) ) verify_expected_output_npy = np.load(self.verify_expected_output_npy) return ( diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index a0d969f8c5..f9fd44d50b 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -69,6 +69,7 @@ DataflowOutputType, ShellFlowType, VerificationStepType, + AutoFIFOSizingMethod ) from finn.builder.passes import step_passes_frontend from finn.core.onnx_exec import execute_onnx @@ -683,6 +684,61 @@ def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): return model +# TODO: Both this and the step_size_... steps will be reworked before merging into dev +# TODO: These are also included in step_set_fifo_depths if the correct FIFO sizing method +# was selected +def step_build_simulation(model: ModelWrapper, cfg: DataflowBuildConfig) -> ModelWrapper: + """Build the simulation binaries for isolated and connected simulations.""" + from finn.transformation.fpgadataflow.simulation_build import BuildSimulation + + model = model.transform( + BuildSimulation( + cfg._resolve_fpga_part(), # noqa + cfg._resolve_hls_clk_period(), # noqa + cfg.functional_simulation, + ) + ) + return model + + +def step_size_fifo_isolated(model: ModelWrapper, cfg: DataflowBuildConfig) -> ModelWrapper: + """Simulate layers in isolation and use the observed behaviour to size the FIFOs accordingly.""" + from pathlib import Path + + from finn.transformation.fpgadataflow.simulation_isolated import RunLayerIsolatedSimulation + + model = model.transform( + RunLayerIsolatedSimulation( + cfg._resolve_fpga_part(), # noqa + cfg._resolve_hls_clk_period(), # noqa + cfg.functional_simulation, + Path(cfg.output_dir), + ) + ) + return model + + +def step_size_fifo_connected(model: ModelWrapper, cfg: DataflowBuildConfig) -> ModelWrapper: + """Simulate layers connected and use the observed behaviour to size the FIFOs accordingly.""" + from finn.transformation.fpgadataflow.simulation_connected import RunLayerParallelSimulation + + model = model.transform( + RunLayerParallelSimulation( + cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period(), cfg # noqa # noqa + ) + ) + return model + + +def step_apply_fifosizes(model: ModelWrapper, cfg: DataflowBuildConfig) -> ModelWrapper: + """Apply the previously found FIFO sizes to the model.""" + from finn.transformation.fpgadataflow.simulation import ApplyFIFOSizes + + model = model.transform(ApplyFIFOSizes(cfg)) + model = model.transform(SplitLargeFIFOs(max_qsrl_depth=256)) + return model + + def step_insert_dwc(model: ModelWrapper, cfg: DataflowBuildConfig): """Inserts data width converters between layers where necessary.""" model = model.transform(InsertDWC()) @@ -828,6 +884,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): ) # InsertAndSetFIFODepths internally removes any shallow FIFOs # so no need to call RemoveShallowFIFOs here + elif cfg.auto_fifo_strategy == AutoFIFOSizingMethod.DISTRIBUTED_SIMULATION: + # TODO: When merging into dev, this should be finalized + model = step_build_simulation(model, cfg) + model = step_size_fifo_connected(model, cfg) + model = step_apply_fifosizes(model, cfg) + return model else: assert "Unsupported auto_fifo_strategy: " + cfg.auto_fifo_strategy else: @@ -1234,7 +1296,10 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_generate_estimate_reports": step_generate_estimate_reports, "step_hw_codegen": step_hw_codegen, "step_hw_ipgen": step_hw_ipgen, - "step_insert_dwc": step_insert_dwc, + "step_build_simulation": step_build_simulation, + "step_size_fifo_isolated": step_size_fifo_isolated, + "step_size_fifo_connected": step_size_fifo_connected, + "step_apply_fifosizes": step_apply_fifosizes, "step_set_fifo_depths": step_set_fifo_depths, "step_create_stitched_ip": step_create_stitched_ip, "step_measure_rtlsim_performance": step_measure_rtlsim_performance, diff --git a/src/finn/builder/custom_step_library/resnet.py b/src/finn/builder/custom_step_library/resnet.py index 02d7f5ff58..1781bd8636 100644 --- a/src/finn/builder/custom_step_library/resnet.py +++ b/src/finn/builder/custom_step_library/resnet.py @@ -34,35 +34,69 @@ hardware conversion. """ +from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine from qonnx.transformation.composed import ComposedTransformation from qonnx.transformation.double_to_single_float import DoubleToSingleFloat from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( + ConvertDivToMul, + ConvertSubToAdd, GiveReadableTensorNames, GiveUniqueNodeNames, GiveUniqueParameterTensors, + RemoveStaticGraphInputs, RemoveUnusedTensors, SortGraph, ) from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.insert_topk import InsertTopK from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.builder.build_dataflow_config import DataflowBuildConfig from finn.transformation.fpgadataflow.replicate_stream import InferReplicateStream from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline.absorb import ( + Absorb1BitMulIntoConv, + Absorb1BitMulIntoMatMul, AbsorbAddIntoMultiThreshold, + AbsorbConsecutiveTransposes, + AbsorbMulIntoMultiThreshold, + AbsorbScalarMulAddIntoTopK, AbsorbSignBiasIntoMultiThreshold, AbsorbTransposeIntoMultiThreshold, + FactorOutMulSignMagnitude, +) +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedAdd, + CollapseRepeatedMul, ) from finn.transformation.streamline.remove import RemoveIdentityReshape, RemoveIdentityTranspose # just for not linear -from finn.transformation.streamline.reorder import MoveMulPastAdd +# just for not linear +from finn.transformation.streamline.reorder import ( + MoveAddPastConv, + MoveAddPastMul, + MoveLinearPastEltwiseAdd, + MoveLinearPastFork, + MoveMaxPoolPastMultiThreshold, + MoveMulPastAdd, + MoveScalarAddPastMatMul, + MoveScalarLinearPastInvariants, + MoveScalarMulPastConv, + MoveScalarMulPastMatMul, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, +) +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from finn.transformation.streamline.sign_to_thres import ConvertSignToThres from finn.transformation.streamline.streamline_plus import StreamlinePlus as Streamline @@ -144,3 +178,155 @@ def step_resnet_convert_to_hw( model = model.transform(RemoveUnusedTensors()) model = model.transform(SortGraph()) return model + + +# For backwards compatibility + + +def step_resnet50_tidy(model: ModelWrapper, cfg: DataflowBuildConfig): + """Tidy up ResNet-50 models (backwards-compatible legacy step). + + Applies shape and datatype inference, constant folding, unique naming, and + inserts a TopK layer at the output. + """ + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(RemoveStaticGraphInputs()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(InsertTopK()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + return model + + +def step_resnet50_streamline_linear(model: ModelWrapper, cfg: DataflowBuildConfig): + """Apply linear streamlining transformations to a ResNet-50 model. + + Moves and absorbs scalar linear operations (mul, add) past convolutions and + matrix multiplications, collapses repeated operations, converts sign nodes + to thresholds, and absorbs values into multithreshold nodes. + """ + streamline_transformations = [ + AbsorbScalarMulAddIntoTopK(), # before MoveAddPastMul to avoid int->float + ConvertSubToAdd(), + ConvertDivToMul(), + RemoveIdentityOps(), + CollapseRepeatedMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveAddPastMul(), + MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + MoveMaxPoolPastMultiThreshold(), + AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + RoundAndClipThresholds(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline_nonlinear(model: ModelWrapper, cfg: DataflowBuildConfig): + """Apply non-linear streamlining transformations to a ResNet-50 model. + + Moves linear operations past elementwise-add nodes and fork points to + enable further fusion in subsequent linear streamlining passes. + """ + streamline_transformations = [ + MoveLinearPastEltwiseAdd(), + MoveLinearPastFork(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + """Streamline a ResNet-50 model (backwards-compatible legacy step). + + Iterates linear and non-linear streamlining passes, then lowers convolutions + to matrix multiplications and absorbs the resulting transpose operations. + """ + for iter_id in range(4): + model = step_resnet50_streamline_linear(model, cfg) + model = step_resnet50_streamline_nonlinear(model, cfg) + + # big loop tidy up + model = model.transform(RemoveUnusedTensors()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + model = model.transform(DoubleToSingleFloat()) + + # Lower convolutions and streamline resulting transposes + model = model.transform(LowerConvsToMatMul()) + model = model.transform( + ComposedTransformation( + [ + MoveTransposePastJoinAdd(), + MoveTransposePastFork(), + MoveTransposePastEltwise(), + AbsorbConsecutiveTransposes(), + AbsorbTransposeIntoMultiThreshold(), + ] + ) + ) + return model + + +def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert a ResNet-50 model to hardware-specific operations (backwards-compatible legacy step). + + Sets the input datatype to UINT8, then sequentially converts channelwise + linear layers, pooling, matrix-vector activations, thresholding, convolution + input generators, stream duplication/addition, and label selection to their + corresponding HLS hardware layer variants. + """ + model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) + model = model.transform(InferDataLayouts()) + model = model.transform(DoubleToSingleFloat()) + model = model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + to_hw_transformations = [ + to_hw.InferChannelwiseLinearLayer, + to_hw.InferPool, + AbsorbConsecutiveTransposes, + RoundAndClipThresholds, + to_hw.InferQuantizedMatrixVectorActivation, + to_hw.InferThresholdingLayer, + to_hw.InferConvInpGen, + to_hw.InferDuplicateStreamsLayer, + to_hw.InferAddStreamsLayer, + to_hw.InferLabelSelectLayer, + ] + for trn in to_hw_transformations: + model = model.transform(trn()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(InferDataTypes()) + + model = model.transform(RemoveCNVtoFCFlatten()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(RemoveUnusedTensors()) + model = model.transform(SortGraph()) + + return model diff --git a/src/finn/builder/custom_step_library/transformer_adhoc.py b/src/finn/builder/custom_step_library/transformer_adhoc.py index cbcd28a916..70aae9d59e 100644 --- a/src/finn/builder/custom_step_library/transformer_adhoc.py +++ b/src/finn/builder/custom_step_library/transformer_adhoc.py @@ -150,7 +150,7 @@ def _set_folding_attention(model: ModelWrapper, target_cycles_per_frame): # parallelism in steps following the common divisors the inputs. for fold in reversed(common_divisors([qkdim, vdim])): # Configure the folding attribute - inst.set_nodeattr("EmbFold", fold) + inst.set_nodeattr("EmbFold", int(fold)) # Check if this is sufficient to meet the cycles target if inst.get_exp_cycles() <= target_cycles_per_frame: break @@ -159,7 +159,7 @@ def _set_folding_attention(model: ModelWrapper, target_cycles_per_frame): # parallelism in steps divisors of the key and value sequence. for fold in reversed(common_divisors([kvlen])): # Configure the folding attribute - inst.set_nodeattr("SeqFold", fold) + inst.set_nodeattr("SeqFold", int(fold)) # Check if this is sufficient to meet the cycles target if inst.get_exp_cycles() <= target_cycles_per_frame: break diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index 9671ed3d71..f8a08260ca 100644 --- a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -29,6 +29,9 @@ import numpy as np import os +import shlex +import subprocess +import sys from pathlib import Path from qonnx.custom_op.registry import getCustomOp from subprocess import CalledProcessError @@ -41,7 +44,8 @@ make_build_dir, ) from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -from finn.util.exception import FINNConfigurationError, FINNError, FINNInternalError +from finn.util.logging import log +from finn.util.exception import FINNConfigurationError, FINNError, FINNInternalError, FINNUserError finnxsi = xsi if xsi.is_available() else None @@ -126,6 +130,10 @@ def file_to_basename(x: str | Path) -> str: def rtlsim_exec_cppxsi( model, execution_context, + is_single_node: bool, + total_nodes: int = 1, + current_node_index: int | None = None, + previous_node_name: str | None = None, dummy_data_mode=False, timeout_cycles=None, throttle_cycles=0, @@ -181,7 +189,8 @@ def rtlsim_exec_cppxsi( vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj") with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f: all_verilog_srcs = f.read().split() - single_src_dir = make_build_dir("rtlsim_" + top_module_name + "_") + rtlsim_name = model.graph.node[0].name if is_single_node else top_module_name + single_src_dir = make_build_dir("rtlsim_" + rtlsim_name + "_") debug = not (trace_file is None or trace_file == "") rtlsim_so = finnxsi.compile_sim_obj( top_module_name, all_verilog_srcs, single_src_dir, debug=debug, behav=True @@ -212,6 +221,7 @@ def rtlsim_exec_cppxsi( raise FINNInternalError("The finn_xsi directory could not be found. Stopping here.") # prepare the C++ sim driver template + finnxsi_dir = os.environ["FINN_XSI"] with fifosim_config_fname.open() as f: fifsom_config_template = f.read() @@ -273,62 +283,72 @@ def rtlsim_exec_cppxsi( "TOP_MODULE_NAME": top_module_name, # top-level AXI stream descriptors "ISTREAM_DESC": instream_descrs_str, + "ISTREAM_LEN": len(instream_names), "OSTREAM_DESC": outstream_descrs_str, + "OSTREAM_LEN": len(outstream_names), # control tracing and trace filename "TRACE_FILE": "nullptr" if trace_file is None else f'"{trace_file}"', # sim kernel .so to use (depends on Vivado version) "SIMKERNEL_SO": finnxsi.get_simkernel_so(), # log file for xsi (not the sim driver) "XSIM_LOG_FILE": '"xsi.log"', + # Node name in case of single-node simulation + "NODE_NAME": model.graph.node[0].name, + # Previous node name (for single node simulation) + "PREVIOUS_NODE_NAME": "std::nullopt" + if previous_node_name is None + else f'"{previous_node_name}"', + "NODE_INDEX": current_node_index if is_single_node else 0, + "TOTAL_NODES": total_nodes, } + + fifosim_config_fname = Path(finnxsi_dir) / "rtlsim_config.hpp.template" + fsim_config = fifosim_config_fname.read_text() for key, val in template_dict.items(): - fifsom_config_template = fifsom_config_template.replace(f"@{key}@", str(val)) - with open(sim_base + "/rtlsim_config.hpp", "w") as f: - f.write(fifsom_config_template) - - vivado_incl_dir = get_vivado_root() + "/data/xsim/include" - # launch g++ to compile the rtlsim executable - build_cmd = [ - "g++", - f"-I{finnxsi_dir}", - f"-I{vivado_incl_dir}", - f"-I{sim_base}", - "-std=c++17", - "-O3", - "-o", - "rtlsim_xsi", - f"{finnxsi_dir}/rtlsim_xsi.cpp", - f"{finnxsi_dir}/xsi_finn.cpp", - "-ldl", - "-lrt", - ] - # write compilation command to a file for easy re-running/debugging - with open(sim_base + "/compile_rtlsim.sh", "w") as f: - f.write(" ".join(build_cmd)) + fsim_config = fsim_config.replace(f"@{key}@", str(val)) + + # Write the config to the simulation directory + rtlsim_config = Path(sim_base) / "rtlsim_config.hpp" + rtlsim_config.write_text(fsim_config) + + # Building the whole simulation + # Running CMake first + cmake_call = f"{sys.executable} -m cmake -S {finnxsi_dir} -B {sim_base}" + log.info(f"Running cmake on RTLSIM Wrapper in {sim_base}") try: - launch_process_helper(build_cmd, cwd=sim_base, print_stdout=False) - except CalledProcessError: - raise FINNError("Failed to compile rtlsim executable") - if not os.path.isfile(sim_base + "/rtlsim_xsi"): - raise FINNError("Failed to compile rtlsim executable") - - # launch the rtlsim executable - runsim_cmd = ["bash", "run_rtlsim.sh"] - with open(sim_base + "/run_rtlsim.sh", "w") as f: - f.write("./rtlsim_xsi > rtlsim_xsi_log.txt") - launch_process_helper(runsim_cmd, cwd=sim_base) + launch_process_helper( + shlex.split(cmake_call), cwd=finnxsi_dir, print_stdout=True, proc_env=os.environ.copy() + ) + except CalledProcessError as e: + raise FINNError(f"Failed to run cmake in {sim_base}") from e + + # Calling make to actually build the simulation + makefile = Path(sim_base) / "Makefile" + if not makefile.exists(): + raise FINNUserError(f"Failed to create Makefile in {sim_base}!") + try: + launch_process_helper(["make"], proc_env=os.environ.copy(), cwd=sim_base) + except CalledProcessError as e: + raise FINNUserError(f"Failed to create executable in {sim_base}!") from e + + # TODO: Fix name for general rtlsim + simulation_executable = Path(sim_base) / "LayerSimulationBackend" + assert simulation_executable.exists() + + # Prepare the script to run the simulation + # (important to specify LD_LIBRARY_PATH here for XSI to work correctly) + runsim = Path(sim_base) / "run_fifosim.sh" + ld_library_path = get_vivado_root() + "/lib/lnx64.o" + runsim.write_text(f"LD_LIBRARY_PATH={ld_library_path}:$LD_LIBRARY_PATH {simulation_executable}") + + # Actually run the simulation + subprocess.run( + ["bash", runsim.name], cwd=sim_base, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) # parse results file and return dict - results_filename = sim_base + "/results.txt" - with open(results_filename, "r") as f: - results = f.read().strip().split("\n") - ret_dict = {} - for result_line in results: - key, val = result_line.split("\t") - ret_dict[key] = int(val) - if "TIMEOUT" in ret_dict.keys(): - assert ret_dict["TIMEOUT"] == 0, f"XSI C++ simulation timed out, see {results_filename}" - return ret_dict + # TODO + return {} def rtlsim_exec_finnxsi(model, execution_context, pre_hook=None, post_hook=None): diff --git a/src/finn/custom_op/fpgadataflow/elementwise_binary.py b/src/finn/custom_op/fpgadataflow/elementwise_binary.py index 3c2203b057..d33629b43e 100644 --- a/src/finn/custom_op/fpgadataflow/elementwise_binary.py +++ b/src/finn/custom_op/fpgadataflow/elementwise_binary.py @@ -35,13 +35,14 @@ from finn.custom_op.fpgadataflow import register_custom_op from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.custom_op.fpgadataflow.memstream import MemStreamSupport # FINN logging from finn.util.logging import log # Generic implementation for elementwise binary operations -class ElementwiseBinaryOperation(HWCustomOp): +class ElementwiseBinaryOperation(MemStreamSupport, HWCustomOp): # Specifies the elementwise operation to be implemented # Format: (Identifier, Python, C++, RTL) _operation: tuple[str, np.ufunc, str, str] | None = None diff --git a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py index 14ef567404..72bc3bd973 100644 --- a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py @@ -35,7 +35,7 @@ from finn.util.logging import log -class CheckSum_hls(HWCustomOp, HLSBackend): +class CheckSum_hls(HLSBackend, HWCustomOp): """Class that corresponds to custom_hls checksum function.""" def __init__(self, onnx_node, **kwargs): diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py index bf239dd056..6fb54ddcff 100644 --- a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py @@ -74,7 +74,7 @@ # -the folded shape is not defined -class IODMA_hls(HWCustomOp, HLSBackend): +class IODMA_hls(HLSBackend, HWCustomOp): """Class that corresponds to finn-hlslib DMA function(s).""" def __init__(self, onnx_node, **kwargs): diff --git a/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py index 610dd2f6ef..34f93705af 100644 --- a/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py @@ -31,7 +31,7 @@ from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class TLastMarker_hls(HWCustomOp, HLSBackend): +class TLastMarker_hls(HLSBackend, HWCustomOp): """Node that adds/removes AXI stream TLAST signals where needed. Its behavior is transparent in node-by-node execution, only visible in IP-stitched rtlsim or actual hardware. diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 5da49c4d98..653aa5a3da 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -36,6 +36,7 @@ from finn import xsi from finn.custom_op.fpgadataflow import templates +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.templates import get_templates_folder from finn.util.basic import CppBuilder, launch_process_helper, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -47,7 +48,7 @@ finnxsi = xsi if xsi.is_available() else None -class HLSBackend(ABC): +class HLSBackend(HWCustomOp, ABC): """HLSBackend class all custom ops that correspond to a finn-hlslib function are using functionality of. Contains different functions every HLS custom node should have. Some as abstract methods, these have to be filled @@ -55,15 +56,19 @@ class HLSBackend(ABC): def get_nodeattr_types(self): """Return dictionary of node attribute types and properties.""" - return { - "code_gen_dir_cppsim": ("s", False, ""), - "executable_path": ("s", False, ""), - "res_hls": ("s", False, ""), - # temporary node attribute to keep track of interface style of hls ops - "cpp_interface": ("s", False, "packed", {"packed", "hls_vector"}), - # temporary node attribute to keep track of execution style of hls ops - "hls_style": ("s", False, "ifm_aware", {"ifm_aware", "freerunning"}), - } + super_types = super().get_nodeattr_types() + super_types.update( + { + "code_gen_dir_cppsim": ("s", False, ""), + "executable_path": ("s", False, ""), + "res_hls": ("s", False, ""), + # temporary node attribute to keep track of interface style of hls ops + "cpp_interface": ("s", False, "packed", {"packed", "hls_vector"}), + # temporary node attribute to keep track of execution style of hls ops + "hls_style": ("s", False, "ifm_aware", {"ifm_aware", "freerunning"}), + } + ) + return super_types def get_all_verilog_paths(self): """Return list of all folders containing Verilog code for this node.""" diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index ce967d1d47..d6132f68ab 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -45,8 +45,9 @@ from qonnx.util.basic import roundup_to_integer_multiple from typing import TYPE_CHECKING, Any, cast -from finn import xsi from finn.util.basic import get_liveness_threshold_cycles, is_versal +from finn.util.deprecated import deprecated +from finn import xsi from finn.util.exception import FINNInternalError from finn.util.logging import log from finn.util.settings import get_settings @@ -56,6 +57,9 @@ finnxsi = xsi if xsi.is_available() else None +if TYPE_CHECKING: + from qonnx.core.modelwrapper import ModelWrapper + class HWCustomOp(CustomOp): """HWCustomOp class all custom ops that can be implemented with either @@ -334,10 +338,9 @@ def rtlsim_multi_io(self, sim: SimEngine, io_dict: dict[str, Any], sname: str = def verify_node(self) -> None: """Can be implemented to verify that all attributes the node needs are there and that particular attributes are set correctly. Can also - check if the number of inputs is equal to the expected number. - """ + check if the number of inputs is equal to the expected number.""" - def generate_params(self, model: "ModelWrapper", path: str) -> None: + def generate_params(self, model: Any, path: str) -> None: """Generate parameters (i.e. weights and thresholds). Member function of HWCustomOp class that must be implemented by every node @@ -514,6 +517,7 @@ def generate_hdl_dynload(self) -> None: with output_path.open("w") as f: f.write(template_wrapper) + @deprecated def derive_characteristic_fxns( self, period: int, override_rtlsim_dict: dict | None = None ) -> None: diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 4f2962a6d1..c87412a2d2 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -48,6 +48,7 @@ ) from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.custom_op.fpgadataflow.memstream import MemStreamSupport from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string from finn.util.logging import log from finn.util.settings import get_settings @@ -60,7 +61,7 @@ # the ... here can be any shape (representing groups of vectors) -class MVAU(HWCustomOp): +class MVAU(MemStreamSupport, HWCustomOp): """Abstraction layer for HW implementation of MatrixVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): diff --git a/src/finn/custom_op/fpgadataflow/memstream.py b/src/finn/custom_op/fpgadataflow/memstream.py new file mode 100644 index 0000000000..ee6305f26f --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/memstream.py @@ -0,0 +1,71 @@ +"""Support for memory stream operations in FPGA dataflow.""" + +import os +from pathlib import Path +from typing import cast + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import is_versal + + +class MemStreamSupport(HWCustomOp): + """Custom Op for memory stream operations in FPGA dataflow.""" + + def calc_tmem(self) -> int: + """Abstract method to calculate threshold memory size. + The default implementation raises NotImplementedError because + some subclasses dont implement calc_tmem.""" + raise NotImplementedError() + + def calc_wmem(self) -> int: + """Abstract method to calculate weight memory size. + The default implementation raises NotImplementedError because + some subclasses dont implement calc_wmem.""" + raise NotImplementedError() + + def generate_hdl_memstream(self, fpgapart: str, pumped_memory: int = 0) -> None: + """Generate verilog code for memstream component. + + Currently utilized by MVAU, VVAU and HLS Thresholding layer. + + Args: + fpgapart: Target FPGA part string. + pumped_memory: Whether to use pumped memory (default: 0). + + """ + ops = ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl", "Thresholding_hls"] + if self.onnx_node.op_type in ops or self.onnx_node.op_type.startswith("Elementwise"): + template_path = ( + Path(os.environ["FINN_RTLLIB"]) / "memstream/hdl/memstream_wrapper_template.v" + ) + mname = self.onnx_node.name + if self.onnx_node.op_type.startswith("Thresholding"): + depth = self.calc_tmem() + else: + depth = self.calc_wmem() + padded_width = self.get_instream_width_padded(1) + code_gen_dir = cast("str", self.get_nodeattr("code_gen_dir_ipgen")) + + ram_style = cast("str", self.get_nodeattr("ram_style")) + init_file = str(Path(code_gen_dir) / "memblock.dat") + if ram_style == "ultra" and not is_versal(fpgapart): + init_file = "" + code_gen_dict = { + "$MODULE_NAME$": [mname], + "$SETS$": ["1"], + "$DEPTH$": [str(depth)], + "$WIDTH$": [str(padded_width)], + "$INIT_FILE$": [init_file], + "$RAM_STYLE$": [ram_style], + "$PUMPED_MEMORY$": [str(pumped_memory)], + } + # apply code generation to template + with template_path.open() as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + output_path = Path(code_gen_dir) / f"{mname}_memstream_wrapper.v" + with output_path.open("w") as f: + f.write(template_wrapper) diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 0f56733541..72befc53ea 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -58,6 +58,7 @@ def register_custom_op(cls): from finn.custom_op.fpgadataflow.rtl.inner_shuffle_rtl import InnerShuffle_rtl from finn.custom_op.fpgadataflow.rtl.layernorm_rtl import LayerNorm_rtl from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MVAU_rtl +from finn.custom_op.fpgadataflow.rtl.removedatapath_rtl import RemoveDataPath_rtl from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import ( StreamingDataWidthConverter_rtl, ) @@ -75,4 +76,5 @@ def register_custom_op(cls): custom_op["MVAU_rtl"] = MVAU_rtl custom_op["VVAU_rtl"] = VVAU_rtl custom_op["Thresholding_rtl"] = Thresholding_rtl +custom_op["RemoveDataPath_rtl"] = RemoveDataPath_rtl custom_op["InnerShuffle_rtl"] = InnerShuffle_rtl diff --git a/src/finn/custom_op/fpgadataflow/rtl/removedatapath_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/removedatapath_rtl.py new file mode 100644 index 0000000000..aa7a7c0004 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/removedatapath_rtl.py @@ -0,0 +1,371 @@ +"""RTL implementation for RemoveDataPath custom operation. + +This module provides the RTL backend implementation for the RemoveDataPath +custom operation, which removes data from the datapath while maintaining +the control flow. +""" + +import numpy as np +import os +from collections.abc import Sequence +from numpy import ndarray +from numpy import typing as npt +from onnx import NodeProto +from pathlib import Path +from qonnx.core.datatype import BaseDataType, DataType +from typing import Any, cast + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.util.exception import FINNInternalError +from finn.util.logging import log + + +class RemoveDataPath_rtl(RTLBackend): + """RTL implementation for RemoveDataPath custom op.""" + + def __init__(self, onnx_node: NodeProto, **kwargs: Any) -> None: + """Initialize RemoveDataPath RTL backend. + + Args: + onnx_node: The ONNX node proto for this operation. + **kwargs: Additional keyword arguments passed to parent class. + + """ + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self) -> dict: + """Return node attribute types for this custom operation. + + Returns: + Dictionary mapping attribute names to their type specifications. + + """ + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + # folded shape of input/output + "folded_shape": ("ints", True, []), + # normal shape of input/output + "normal_shape": ("ints", True, []), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + } + ) + return my_attrs + + def infer_node_datatype(self, model: Any) -> None: + """Infer and set the output datatype based on input datatype. + + Args: + model: The model wrapper containing this node. + + """ + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + log.warning( + f"inputDataType changing for {node.name}: {self.get_input_datatype()} -> {idt}" + ) + self.set_nodeattr("dataType", idt.name) + # data type stays the same + model.set_tensor_datatype(node.output[0], idt) + + def get_rtl_file_list(self, abspath: bool = False) -> list[Path]: + """Return list of RTL files required for this custom operation. + + Args: + abspath: Whether to return absolute paths (default: False). + + Returns: + List of Path objects pointing to required RTL files. + + Raises: + FINNInternalError: If code_gen_dir_ipgen or gen_top_module attributes are invalid. + + """ + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if abspath else "" + + top_name = self.get_nodeattr("gen_top_module") + if type(code_gen_dir) is not str: + raise FINNInternalError( + f"code_gen_dir_ipgen attribute not set in {self.onnx_node.name}, " + "cannot get RTL file list" + ) + if type(top_name) is not str or top_name == "": + raise FINNInternalError( + f"gen_top_module attribute not set in {self.onnx_node.name}, " + "cannot get RTL file list" + ) + + code_gen_dir_path = Path(code_gen_dir) + + verilog_files = [ + code_gen_dir_path / f"{top_name}.v", + ] + return verilog_files + + def generate_hdl(self, model: Any, fpgapart: str, clk: str) -> None: # noqa: ARG002 + """Generate the RTL code for this custom op. + + Args: + model: The model wrapper containing this node (unused). + fpgapart: Target FPGA part string (unused). + clk: Clock period in nanoseconds (unused). + + Raises: + FINNInternalError: If code_gen_dir_ipgen attribute is invalid. + + """ + rtlsrc = Path(os.environ["FINN_RTLLIB"]) / "removedatapath" / "hdl" + template_path = rtlsrc / "dummy_template.v" + + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + topname = self.get_verilog_top_module_name() + self.set_nodeattr("gen_top_module", topname) + + # make instream width a multiple of 8 for axi interface + in_width = self.get_instream_width_padded() + + code_gen_dict = {"$TOP_MODULE_NAME$": topname, "$WIDTH$": str(in_width)} + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + if type(code_gen_dir) is not str or code_gen_dir == "": + raise FINNInternalError( + f"code_gen_dir_ipgen attribute not set in {topname}, cannot generate RTL code" + ) + with Path.open(template_path) as f: + template = f.read() + + for placeholder, value in code_gen_dict.items(): + template = template.replace(placeholder, value) + + output_path = Path(code_gen_dir) / f"{self.get_verilog_top_module_name()}.v" + with Path.open(output_path, "w") as f: + f.write(template) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + # i.e. during the HLSSynthIP() transformation + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def code_generation_ipi(self) -> list[str]: + """Code generation for IP integration.""" + sourcefiles = self.get_rtl_file_list(abspath=True) + + cmd = [] + for f in sourcefiles: + cmd += [f"add_files -norecurse {f}"] + cmd += [ + "create_bd_cell -type module -reference " + f"{self.get_nodeattr('gen_top_module')} {self.onnx_node.name}" + ] + return cmd + + def get_normal_input_shape( + self, ind: int = 0 # noqa: ARG002 + ) -> Sequence[int] | npt.NDArray[np.int_]: + """Return the normal (unfolded) input shape. + + Args: + ind: Input index (unused, kept for interface compatibility). + + Returns: + The normal input shape dimensions. + + Raises: + FINNInternalError: If normal_shape attribute is invalid or empty. + + """ + normal_shape = self.get_nodeattr("normal_shape") + if ( + type(normal_shape) is not list + and type(normal_shape) is not tuple + and not isinstance(normal_shape, ndarray) + ): + raise FINNInternalError( + f"normal_shape attribute not set correctly in {self.onnx_node.name}, " + "cannot get normal input shape" + ) + if len(normal_shape) == 0: + raise FINNInternalError( + f"normal_shape attribute is empty in {self.onnx_node.name}, " + "cannot get normal input shape" + ) + if type(normal_shape[0]) is not int: + raise FINNInternalError( + f"normal_shape attribute not set correctly in {self.onnx_node.name}, " + "cannot get normal input shape" + ) + return normal_shape + + def get_normal_output_shape( + self, ind: int = 0 # noqa: ARG002 + ) -> Sequence[int] | npt.NDArray[np.int_]: + """Return the normal (unfolded) output shape. + + Args: + ind: Output index (unused, kept for interface compatibility). + + Returns: + Tuple containing the normal output shape dimensions. + + """ + return self.get_normal_input_shape() + + def get_folded_input_shape( + self, ind: int = 0 # noqa: ARG002 + ) -> Sequence[int] | npt.NDArray[np.int_]: + """Return the folded input shape. + + Args: + ind: Input index (unused, kept for interface compatibility). + + Returns: + Tuple containing the folded input shape dimensions. + + """ + folded_shape = self.get_nodeattr("folded_shape") + if ( + type(folded_shape) is not list + and type(folded_shape) is not tuple + and not isinstance(folded_shape, ndarray) + ): + raise FINNInternalError( + f"folded_shape attribute not set correctly in {self.onnx_node.name}, " + "cannot get folded input shape" + ) + if len(folded_shape) == 0: + raise FINNInternalError( + f"folded_shape attribute is empty in {self.onnx_node.name}, " + "cannot get folded input shape" + ) + if type(folded_shape[0]) is not int: + raise FINNInternalError( + f"folded_shape attribute not set correctly in {self.onnx_node.name}, " + "cannot get folded input shape" + ) + return cast("Sequence[int]", folded_shape) + + def get_folded_output_shape( + self, ind: int = 0 # noqa: ARG002 + ) -> Sequence[int] | npt.NDArray[np.int_]: + """Return the folded output shape. + + Args: + ind: Output index (unused, kept for interface compatibility). + + Returns: + Tuple containing the folded output shape dimensions. + + """ + return self.get_folded_input_shape() + + def get_instream_width(self, ind: int = 0) -> int: # noqa: ARG002 + """Return the input stream width in bits. + + Args: + ind: Input index (unused, kept for interface compatibility). + + Returns: + Input stream width in bits. + + """ + dtype = self.get_nodeattr("dataType") + if type(dtype) is not str: + raise FINNInternalError( + f"dataType attribute not set correctly in {self.onnx_node.name}, " + "cannot get instream width" + ) + dtype = DataType[dtype] + folded_shape = self.get_nodeattr("folded_shape") + if ( + type(folded_shape) is not list + and type(folded_shape) is not tuple + and not isinstance(folded_shape, ndarray) + ): + raise FINNInternalError( + f"folded_shape attribute not set correctly in {self.onnx_node.name}, " + "cannot get outstream width" + ) + in_width = folded_shape[-1] * dtype.bitwidth() + return in_width + + def get_outstream_width(self, ind: int = 0) -> int: # noqa: ARG002 + """Return the output stream width in bits. + + Args: + ind: Output index (unused, kept for interface compatibility). + + Returns: + Output stream width in bits. + + Raises: + FINNInternalError: If dataType or folded_shape attributes are invalid. + + """ + dtype = self.get_nodeattr("dataType") + if type(dtype) is not str: + raise FINNInternalError( + f"dataType attribute not set correctly in {self.onnx_node.name}, " + "cannot get outstream width" + ) + dtype = DataType[dtype] + folded_shape = self.get_nodeattr("folded_shape") + if ( + type(folded_shape) is not list + and type(folded_shape) is not tuple + and not isinstance(folded_shape, ndarray) + ): + raise FINNInternalError( + f"folded_shape attribute not set correctly in {self.onnx_node.name}, " + "cannot get outstream width" + ) + in_width = folded_shape[-1] * dtype.bitwidth() + return in_width + + def get_input_datatype(self, ind: int = 0) -> BaseDataType: # noqa: ARG002 + """Return the input data type. + + Args: + ind: Input index (unused, kept for interface compatibility). + + Returns: + The QONNX data type for the input. + + Raises: + FINNInternalError: If dataType attribute is invalid. + + """ + dtype = self.get_nodeattr("dataType") + if type(dtype) is not str: + raise FINNInternalError( + f"dataType attribute not set correctly in {self.onnx_node.name}, " + "cannot get outstream width" + ) + dtype = DataType[dtype] + return dtype + + def get_output_datatype(self, ind: int = 0) -> BaseDataType: # noqa: ARG002 + """Return the output data type. + + Args: + ind: Output index (unused, kept for interface compatibility). + + Returns: + The QONNX data type for the output. + + Raises: + FINNInternalError: If dataType attribute is invalid. + + """ + dtype = self.get_nodeattr("dataType") + if type(dtype) is not str: + raise FINNInternalError( + f"dataType attribute not set correctly in {self.onnx_node.name}, " + "cannot get outstream width" + ) + dtype = DataType[dtype] + return dtype diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py index 96d3d9f116..d8a5d210b9 100644 --- a/src/finn/custom_op/fpgadataflow/rtlbackend.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -26,76 +26,174 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +"""RTL backend support for FINN custom operations. + +This module provides the RTLBackend abstract base class that all RTL-based custom +operations in FINN inherit from. It includes functionality for HDL code generation, +RTL simulation, and integration with Vivado IP Integrator. +""" + import numpy as np -import os +import numpy.typing as npt from abc import ABC, abstractmethod +from pathlib import Path +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + from onnx import GraphProto + from qonnx.core.modelwrapper import ModelWrapper +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn import xsi from finn.util.basic import make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.util.exception import FINNInternalError from finn.util.logging import log finnxsi = xsi if xsi.is_available() else None -class RTLBackend(ABC): +class RTLBackend(HWCustomOp, ABC): """RTLBackend class all custom ops that correspond to a module in finn-rtllib are using functionality of. Contains different functions every RTL custom node should have. Some as abstract methods, these have to be filled when writing a new RTL custom op node.""" - def get_nodeattr_types(self): - return { - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), - } + def get_nodeattr_types( + self, + ) -> dict[ + str, + tuple[str, bool, int | float | str | bool | npt.NDArray | list] + | tuple[str, bool, int | float | str | bool | npt.NDArray | list, set | None], + ]: + """Return 4-tuple (dtype, required, default_val, allowed_values) for attribute + with name. allowed_values will be None if not specified. + + Returns: + dict[ str, tuple[str, bool, int | float | str | bool | npt.NDArray | list] | tuple[ + str, bool, int | float | str | bool | npt.NDArray | list, set | None]]: + Dictionary of node attribute types + """ + super_attrs = super().get_nodeattr_types() + super_attrs.update( + { + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } + ) + return super_attrs @abstractmethod - def generate_hdl(self, model, fpgapart, clk): - pass + def generate_hdl(self, model: "ModelWrapper", fpgapart: str, clk: str) -> None: + """Generate HDL code for this node. + + Args: + model: The FINN model containing this node + fpgapart: Target FPGA part string + clk: Clock period specification - def prepare_rtlsim(self, behav=False): - """Creates a xsi emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path.""" + Returns: + None + """ + + def prepare_rtlsim(self) -> None: + """Create a xsi emulation library for the RTL code generated for this node. + Sets the rtlsim_so attribute to the path of the generated library. + + Returns: + None + """ + import finn_xsi.adapter as finnxsi verilog_files = self.get_rtl_file_list(abspath=True) single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") trace_file = self.get_nodeattr("rtlsim_trace") debug = not (trace_file is None or trace_file == "") ret = finnxsi.compile_sim_obj( - self.get_verilog_top_module_name(), verilog_files, single_src_dir, debug, behav + self.get_verilog_top_module_name(), verilog_files, single_src_dir, debug ) # save generated lib filename in attribute self.set_nodeattr("rtlsim_so", ret[0] + "/" + ret[1]) - def get_verilog_paths(self): - """Returns path to code gen directory. Can be overwritten to - return additional paths to relevant verilog files""" + def get_verilog_paths(self) -> list[str]: + """Return path to code gen directory. + Can be overwritten to return additional paths to relevant verilog files. + + Returns: + list[str]: List of paths to directories containing Verilog files + """ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - return [code_gen_dir] + return [cast("str", code_gen_dir)] @abstractmethod - def get_rtl_file_list(self, abspath=False): - """Returns list of rtl files. Needs to be filled by each node.""" - pass + def get_rtl_file_list(self, abspath: bool = False) -> list[str] | list[Path]: + """Return list of RTL files. + Must be implemented by each subclass to provide the list of RTL files used by this node. + + Args: + abspath: If True, return absolute paths; if False, return relative paths + + Returns: + list[str] | list[Path]: List of paths to RTL files + """ @abstractmethod - def code_generation_ipi(self): - pass + def code_generation_ipi(self) -> list[str]: + """Generate TCL commands for IP Integrator. + Must be implemented by each subclass to provide the TCL commands needed + to integrate this node into Vivado IP Integrator. + + Returns: + list[str]: List of TCL commands for IP Integrator + """ - def code_generation_ipgen(self, model, fpgapart, clk): + def code_generation_ipgen(self, model: "ModelWrapper", fpgapart: str, clk: str) -> None: + """Generate HDL code for IP generation. + Wrapper method that calls generate_hdl to produce the HDL code for this node. + + Args: + model: The FINN model containing this node + fpgapart: Target FPGA part string + clk: Clock period specification + + Returns: + None + """ self.generate_hdl(model, fpgapart, clk) - def execute_node(self, context, graph): + def execute_node( + self, context: dict[str, npt.NDArray], graph: "GraphProto" + ) -> None: # noqa: ARG002 + """Execute this node's RTL simulation. + + Args: + context: Dictionary mapping tensor names to their numpy array values + graph: The ONNX graph containing this node + + Returns: + None + + Raises: + Exception: If exec_mode is not set to "rtlsim" + """ mode = self.get_nodeattr("exec_mode") - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + code_gen_dir = cast("str", self.get_nodeattr("code_gen_dir_ipgen")) if mode == "rtlsim": node = self.onnx_node inputs = {} for i, inp in enumerate(node.input): - exp_ishape = tuple(self.get_normal_input_shape(i)) + shape = self.get_normal_input_shape(i) + if shape is None: + raise FINNInternalError( + f"Input shape for input {i} of node {node.name} is None." + ) + exp_ishape = tuple(shape) folded_ishape = self.get_folded_input_shape(i) + if folded_ishape is None: + raise FINNInternalError( + f"Folded input shape for input {i} of node {node.name} is None." + ) inp_val = context[inp] # Make sure the input has the right container datatype if inp_val.dtype != np.float32: @@ -112,15 +210,14 @@ def execute_node(self, context, graph): export_idt = self.get_input_datatype(i) reshaped_input = inp_val.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_%s.npy" % i), reshaped_input) + input_path = Path(code_gen_dir) / f"input_{i}.npy" + np.save(input_path, reshaped_input) nbits = self.get_instream_width(i) - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_{}.npy".format(code_gen_dir, i), export_idt, nbits - ) - inputs["in%s" % i] = rtlsim_inp + rtlsim_inp = npy_to_rtlsim_input(str(input_path), export_idt, nbits) + inputs[f"in{i}"] = rtlsim_inp outputs = {} - for o, outp in enumerate(node.output): - outputs["out%s" % o] = [] + for o, _ in enumerate(node.output): + outputs[f"out{o}"] = [] # assembled execution context io_dict = {"inputs": inputs, "outputs": outputs} @@ -129,17 +226,22 @@ def execute_node(self, context, graph): self.rtlsim_multi_io(sim, io_dict) self.close_rtlsim(sim) for o, outp in enumerate(node.output): - rtlsim_output = io_dict["outputs"]["out%s" % o] + rtlsim_output = io_dict["outputs"][f"out{o}"] odt = self.get_output_datatype(o) target_bits = odt.bitwidth() packed_bits = self.get_outstream_width(o) - out_npy_path = "{}/output.npy".format(code_gen_dir) + out_npy_path = f"{code_gen_dir}/output.npy" out_shape = self.get_folded_output_shape(o) rtlsim_output_to_npy( rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits ) # load and reshape output - exp_oshape = tuple(self.get_normal_output_shape(o)) + oshape = self.get_normal_output_shape(o) + if oshape is None: + raise FINNInternalError( + f"Output shape for output {o} of node {node.name} is None." + ) + exp_oshape = tuple(oshape) output = np.load(out_npy_path) output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) context[outp] = output @@ -150,8 +252,6 @@ def execute_node(self, context, graph): else: raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) + f"""Invalid value for attribute exec_mode! Is currently set to: {mode} + has to be set to one of the following value ("cppsim", "rtlsim")""" ) diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 026d5794e2..7eb2f4506e 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -37,11 +37,12 @@ from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.custom_op.fpgadataflow.memstream import MemStreamSupport from finn.util.exception import FINNInternalError from finn.util.logging import log -class Thresholding(HWCustomOp): +class Thresholding(MemStreamSupport, HWCustomOp): """Abstraction layer for HW implementation of Thresholding.""" def __init__(self, onnx_node, **kwargs): diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index ec31c68330..2bef9bfc65 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -48,12 +48,13 @@ ) from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.custom_op.fpgadataflow.memstream import MemStreamSupport from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string from finn.util.logging import log from finn.util.settings import get_settings -class VVAU(HWCustomOp): +class VVAU(MemStreamSupport, HWCustomOp): """Abstraction layer for HW implementation of VectorVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index 0e6145063b..85708f3d11 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -934,7 +934,7 @@ def bench( # Late import because we need prepare_finn to setup remaining dependencies first from finn.benchmarking.bench import start_bench_run - exit_code = start_bench_run(bench_config) + exit_code = start_bench_run(str(bench_config)) sys.exit(exit_code) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 52729919d6..2d170ee84b 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -1,3 +1,9 @@ +"""Create stitched IP from FINN dataflow graph. + +This module provides transformations to create a Vivado IP Block Design project +from generated IPs in a FINN dataflow graph. +""" + # Copyright (c) 2020, Xilinx, Inc. # Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. @@ -27,31 +33,37 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -"""Transformation to create stitched IP from dataflow graph components.""" - import json import multiprocessing as mp import os +from pathlib import Path +from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.util.basic import get_num_default_workers from shutil import copytree from subprocess import CalledProcessError +from typing import TYPE_CHECKING, Literal, cast + +if TYPE_CHECKING: + from onnx import NodeProto +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.templates import get_templates_folder from finn.transformation.fpgadataflow.replace_verilog_relpaths import ReplaceVerilogRelPaths from finn.util.basic import launch_process_helper, make_build_dir -from finn.util.exception import FINNError, FINNUserError +from finn.util.exception import FINNInternalError, FINNUserError from finn.util.fpgadataflow import is_hls_node, is_rtl_node from finn.util.logging import log -def is_external_input(model, node, i): - """ - Determine whether input i of node should be made external. +def is_external_input(model: ModelWrapper, node: "NodeProto", i: int) -> bool: + """Check if input i of node should be made external. - True only if input is unconnected and has no initializer. - Only exception is second input of FC layers when mem_mode is external. + Returns True only if input is unconnected and has no initializer. + Exception: second input of FC layers when mem_mode is external. """ node_inst = getCustomOp(node) op_type = node.op_type @@ -59,21 +71,19 @@ def is_external_input(model, node, i): if producer is None: if model.get_initializer(node.input[i]) is None: return True - else: - if op_type.startswith("MVAU"): - if node_inst.get_nodeattr("mem_mode") == "external": - return True + if op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") == "external": + return True return False -def is_external_output(model, node, i): - """Determine whether output i of node should be made external.""" +def is_external_output(model: ModelWrapper, node: "NodeProto", i: int) -> bool: + """Check if output i of node should be made external. + + Returns True only if output is unconnected. + """ + # TODO should ideally check if tensor is in top-level outputs consumers = model.find_consumers(node.output[i]) - if consumers == []: - # TODO should ideally check if tensor is in top-level - # outputs - return True - return False + return consumers == [] class CreateStitchedIP(Transformation): @@ -90,14 +100,34 @@ class CreateStitchedIP(Transformation): The packaged block design IP can be found under the ip subdirectory. """ - def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signature=[]): - """Initialize CreateStitchedIP transformation with FPGA part and clock settings.""" + def __init__( + self, + fpgapart: str, + clk_ns: float, + ip_name: str = "finn_design", + vitis: bool = False, + signature: list | None = None, + functional_simulation: bool = False, + ) -> None: + """Initialize CreateStitchedIP transformation. + + Args: + fpgapart: FPGA part identifier + clk_ns: Clock period in nanoseconds + ip_name: Name for the IP design + vitis: Whether to target Vitis + signature: Optional signature list [customer, application, version] + functional_simulation: Whether to generate functional simulation wrapper + """ + if signature is None: + signature = [] super().__init__() self.fpgapart = fpgapart self.clk_ns = clk_ns self.ip_name = ip_name self.vitis = vitis self.signature = signature + self.functional_simulation = functional_simulation self.has_aximm = False self.has_m_axis = False self.m_axis_idx = 0 @@ -118,208 +148,221 @@ def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signatu "ap_none": [], } - def is_double_pumped(self, node): - """Check if node uses double pumped computation.""" + def is_double_pumped(self, node: "NodeProto") -> bool: + """Check if node uses double-pumped compute or memory.""" if node.op_type.startswith("MVAU"): inst = getCustomOp(node) try: - pumped_compute = inst.get_nodeattr("pumpedCompute") + pumped_compute = cast("int", inst.get_nodeattr("pumpedCompute")) except AttributeError: pumped_compute = 0 - return pumped_compute or inst.get_nodeattr("pumpedMemory") + return bool(pumped_compute or cast("int", inst.get_nodeattr("pumpedMemory"))) + return False - def connect_clk_rst(self, node): - """Connect clock and reset signals for the node.""" + def connect_clk_rst(self, node: "NodeProto") -> None: + """Connect clock and reset signals for a node.""" inst_name = node.name node_inst = getCustomOp(node) + if not isinstance(node_inst, HWCustomOp): + raise FINNInternalError( + f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces." + ) clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0] reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0] + # make clock and reset external, if they aren't already if not self.clock_reset_are_external: - self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) - ) - self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]") - self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, reset_intf_name) + self.connect_cmds.extend( + [ + f"make_bd_pins_external [get_bd_pins {inst_name}/{clock_intf_name}]", + "set_property name ap_clk [get_bd_ports ap_clk_0]", + f"make_bd_pins_external [get_bd_pins {inst_name}/{reset_intf_name}]", + "set_property name ap_rst_n [get_bd_ports ap_rst_n_0]", + ] ) - self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]") self.clock_reset_are_external = True self.intf_names["clk"] = ["ap_clk"] self.intf_names["rst"] = ["ap_rst_n"] # otherwise connect clock and reset else: - self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]" - % (inst_name, reset_intf_name) - ) - self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" - % (inst_name, clock_intf_name) + self.connect_cmds.extend( + [ + f"connect_bd_net [get_bd_ports ap_rst_n] " + f"[get_bd_pins {inst_name}/{reset_intf_name}]", + f"connect_bd_net [get_bd_ports ap_clk] " + f"[get_bd_pins {inst_name}/{clock_intf_name}]", + ] ) + # make clk2x external, if it isn't already and connect clk2x if self.is_double_pumped(node): clock2x_intf_name = node_inst.get_verilog_top_module_intf_names()["clk2x"][0] if not self.clock2x_is_external: - self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock2x_intf_name) + self.connect_cmds.extend( + [ + f"make_bd_pins_external [get_bd_pins {inst_name}/{clock2x_intf_name}]", + "set_property name ap_clk2x [get_bd_ports ap_clk2x_0]", + ] ) - self.connect_cmds.append("set_property name ap_clk2x [get_bd_ports ap_clk2x_0]") self.clock2x_is_external = True self.intf_names["clk2x"] = ["ap_clk2x"] # otherwise connect clk2x else: if self.is_double_pumped(node): self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_clk2x] [get_bd_pins %s/%s]" - % (inst_name, clock2x_intf_name) + f"connect_bd_net [get_bd_ports ap_clk2x] " + f"[get_bd_pins {inst_name}/{clock2x_intf_name}]" ) - def connect_axi(self, node): - """Connect AXI interfaces for the node.""" + def connect_axi(self, node: "NodeProto") -> None: + """Connect AXI-Lite and AXI-MM interfaces for a node.""" inst_name = node.name node_inst = getCustomOp(node) + if not isinstance(node_inst, HWCustomOp): + raise FINNInternalError( + f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces." + ) axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"] aximm_intf_name = node_inst.get_verilog_top_module_intf_names()["aximm"] + if len(axilite_intf_name) != 0: self.connect_cmds.append( - "make_bd_intf_pins_external " - "[get_bd_intf_pins %s/%s]" % (inst_name, axilite_intf_name[0]) - ) - ext_if_name = "%s_%d" % ( - axilite_intf_name[0], - len(self.intf_names["axilite"]), + f"make_bd_intf_pins_external [get_bd_intf_pins {inst_name}/{axilite_intf_name[0]}]" ) + ext_if_name = f"{axilite_intf_name[0]}_{len(self.intf_names['axilite'])}" self.intf_names["axilite"].append(ext_if_name) + if len(aximm_intf_name) != 0: - self.connect_cmds.append( - "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" - % (inst_name, aximm_intf_name[0][0]) - ) - ext_if_name = "m_axi_gmem%d" % (len(self.intf_names["aximm"])) - self.connect_cmds.append( - "set_property name %s [get_bd_intf_ports m_axi_gmem_0]" % ext_if_name + ext_if_name = f"m_axi_gmem{len(self.intf_names['aximm'])}" + seg_name = f"{inst_name}/Data_m_axi_gmem/SEG_{ext_if_name}_Reg" + + self.connect_cmds.extend( + [ + f"make_bd_intf_pins_external " + f"[get_bd_intf_pins {inst_name}/{aximm_intf_name[0][0]}]", + f"set_property name {ext_if_name} [get_bd_intf_ports m_axi_gmem_0]", + "assign_bd_address", + f"set_property offset 0 [get_bd_addr_segs {{{seg_name}}}]", + f"set_property range 4G [get_bd_addr_segs {{{seg_name}}}]", + ] ) - self.connect_cmds.append("assign_bd_address") - seg_name = "%s/Data_m_axi_gmem/SEG_%s_Reg" % (inst_name, ext_if_name) - self.connect_cmds.append("set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name)) - # TODO should propagate this information from the node instead of 4G - self.connect_cmds.append("set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name)) + self.intf_names["aximm"] = [(ext_if_name, aximm_intf_name[0][1])] self.has_aximm = True - def connect_m_axis_external(self, node, idx=None): - """Connect master AXI stream interfaces as external ports.""" + def connect_m_axis_external(self, node: "NodeProto", idx: int | None = None) -> None: + """Make AXI Stream master interface(s) external.""" inst_name = node.name node_inst = getCustomOp(node) + if not isinstance(node_inst, HWCustomOp): + raise FINNInternalError( + f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces." + ) output_intf_names = node_inst.get_verilog_top_module_intf_names()["m_axis"] + # make output axis external for i in range(len(output_intf_names)): if idx is not None and idx != i: continue output_intf_name = output_intf_names[i][0] - self.connect_cmds.append( - "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" - % (inst_name, output_intf_name) - ) - self.connect_cmds.append( - "set_property name m_axis_%d [get_bd_intf_ports %s_0]" - % (self.m_axis_idx, output_intf_name) + + self.connect_cmds.extend( + [ + f"make_bd_intf_pins_external [get_bd_intf_pins {inst_name}/{output_intf_name}]", + f"set_property name m_axis_{self.m_axis_idx} " + f"[get_bd_intf_ports {output_intf_name}_0]", + ] ) + self.has_m_axis = True - self.intf_names["m_axis"].append( - ("m_axis_%d" % self.m_axis_idx, output_intf_names[i][1]) - ) + self.intf_names["m_axis"].append((f"m_axis_{self.m_axis_idx}", output_intf_names[i][1])) self.m_axis_idx += 1 - def connect_s_axis_external(self, node, idx=None): - """Connect slave AXI stream interfaces as external ports.""" + def connect_s_axis_external(self, node: "NodeProto", idx: int | None = None) -> None: + """Make AXI Stream slave interface(s) external.""" inst_name = node.name node_inst = getCustomOp(node) + if not isinstance(node_inst, HWCustomOp): + raise FINNInternalError( + f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces." + ) input_intf_names = node_inst.get_verilog_top_module_intf_names()["s_axis"] + # make input axis external for i in range(len(input_intf_names)): if idx is not None and idx != i: continue input_intf_name = input_intf_names[i][0] - self.connect_cmds.append( - "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" % (inst_name, input_intf_name) - ) - self.connect_cmds.append( - "set_property name s_axis_%d [get_bd_intf_ports %s_0]" - % (self.s_axis_idx, input_intf_name) + + self.connect_cmds.extend( + [ + f"make_bd_intf_pins_external [get_bd_intf_pins {inst_name}/{input_intf_name}]", + f"set_property name s_axis_{self.s_axis_idx} " + f"[get_bd_intf_ports {input_intf_name}_0]", + ] ) + self.has_s_axis = True - self.intf_names["s_axis"].append( - ("s_axis_%d" % self.s_axis_idx, input_intf_names[i][1]) - ) + self.intf_names["s_axis"].append((f"s_axis_{self.s_axis_idx}", input_intf_names[i][1])) self.s_axis_idx += 1 - def connect_ap_none_external(self, node): - """Connect ap_none interfaces as external ports.""" + def connect_ap_none_external(self, node: "NodeProto") -> None: + """Make ap_none interfaces external.""" inst_name = node.name node_inst = getCustomOp(node) + if not isinstance(node_inst, HWCustomOp): + raise FINNInternalError( + f"Node {node.name} is not an HWCustomOp, cannot connect AXI interfaces." + ) input_intf_names = node_inst.get_verilog_top_module_intf_names()["ap_none"] + # make external for i in range(len(input_intf_names)): input_intf_name = input_intf_names[i] - self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, input_intf_name) - ) - self.connect_cmds.append( - "set_property name %s [get_bd_ports %s_0]" % (input_intf_name, input_intf_name) + self.connect_cmds.extend( + [ + f"make_bd_pins_external [get_bd_pins {inst_name}/{input_intf_name}]", + f"set_property name {input_intf_name} [get_bd_ports {input_intf_name}_0]", + ] ) self.intf_names["ap_none"].append(input_intf_name) - def insert_signature(self, checksum_count): - """Insert signature block for design identification.""" + def insert_signature(self, checksum_count: int) -> None: + """Insert AXI info signature component into the design.""" signature_vlnv = "AMD:user:axi_info_top:1.0" signature_name = "axi_info_top0" - self.create_cmds.append( - "create_bd_cell -type ip -vlnv %s %s" % (signature_vlnv, signature_name) - ) - self.create_cmds.append( - "set_property -dict [list " - "CONFIG.SIG_CUSTOMER {%s} " - "CONFIG.SIG_APPLICATION {%s} " - "CONFIG.VERSION {%s} " - "CONFIG.CHECKSUM_COUNT {%s} " - "] [get_bd_cells %s]" - % ( - self.signature[0], - self.signature[1], - self.signature[2], - checksum_count, - signature_name, - ) - ) - # set clk and reset - self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/ap_clk]" % signature_name - ) - self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/ap_rst_n]" % signature_name - ) fclk_mhz = 1 / (self.clk_ns * 0.001) fclk_hz = fclk_mhz * 1000000 - self.connect_cmds.append( - "set_property -dict [list " - "CONFIG.FREQ_HZ {%f} " - "CONFIG.CLK_DOMAIN {ap_clk} " - "] [get_bd_intf_pins %s/s_axi]" - % ( - fclk_hz, - signature_name, - ) + + # Create signature cell and configure properties + self.create_cmds.extend( + [ + f"create_bd_cell -type ip -vlnv {signature_vlnv} {signature_name}", + f"set_property -dict [list " + f"CONFIG.SIG_CUSTOMER {{{self.signature[0]}}} " + f"CONFIG.SIG_APPLICATION {{{self.signature[1]}}} " + f"CONFIG.VERSION {{{self.signature[2]}}} " + f"CONFIG.CHECKSUM_COUNT {{{checksum_count}}} " + f"] [get_bd_cells {signature_name}]", + ] ) - # make axilite interface external - self.connect_cmds.append( - "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name + + # Connect clocks, resets and configure AXI interface + self.connect_cmds.extend( + [ + f"connect_bd_net [get_bd_ports ap_clk] [get_bd_pins {signature_name}/ap_clk]", + f"connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins {signature_name}/ap_rst_n]", + f"set_property -dict [list " + f"CONFIG.FREQ_HZ {{{fclk_hz}}} " + f"CONFIG.CLK_DOMAIN {{ap_clk}} " + f"] [get_bd_intf_pins {signature_name}/s_axi]", + f"make_bd_intf_pins_external [get_bd_intf_pins {signature_name}/s_axi]", + "set_property name s_axilite_info [get_bd_intf_ports s_axi_0]", + "assign_bd_address", + ] ) - self.connect_cmds.append("set_property name s_axilite_info [get_bd_intf_ports s_axi_0]") - self.connect_cmds.append("assign_bd_address") - def apply(self, model): + def apply(self, model: "ModelWrapper") -> tuple[ModelWrapper, Literal[False]]: """Apply the CreateStitchedIP transformation to the model.""" # ensure non-relative readmemh .dat files model = model.transform(ReplaceVerilogRelPaths()) @@ -328,7 +371,10 @@ def apply(self, model): ip_dirs.append("$::env(FINN_RTLLIB)/memstream") if self.signature: ip_dirs.append("$::env(FINN_RTLLIB)/axi_info") - if model.graph.node[0].op_type not in ["StreamingFIFO_rtl", "IODMA_hls"]: + if ( + model.graph.node[0].op_type not in ["StreamingFIFO_rtl", "IODMA_hls"] + and self.functional_simulation is False + ): log.warning( """First node is not StreamingFIFO or IODMA. You may experience incorrect stitched-IP rtlsim or hardware @@ -345,12 +391,23 @@ def apply(self, model): ) for node in model.graph.node: # ensure that all nodes are fpgadataflow, and that IPs are generated - assert is_hls_node(node) or is_rtl_node( - node - ), "All nodes must be FINN fpgadataflow nodes." + if not is_hls_node(node) and not is_rtl_node(node): + raise FINNUserError( + f"{node.name} is not an fpgadataflow node. Aborting stitching IP." + ) node_inst = getCustomOp(node) + if not isinstance(node_inst, RTLBackend) and not isinstance(node_inst, HLSBackend): + raise FINNInternalError( + f"Node {node.name} is not an RTL Node or HLS Node, " + "cannot connect AXI interfaces." + ) ip_dir_value = node_inst.get_nodeattr("ip_path") - assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist." + if type(ip_dir_value) is not str or ip_dir_value == "": + raise FINNInternalError(f"ip_path has the wrong type in node {node.name}.") + if not Path(ip_dir_value).is_dir(): + raise FINNInternalError( + f"IP generation directory doesn't exist in node {node.name}." + ) ip_dirs += [ip_dir_value] self.create_cmds += node_inst.code_generation_ipi() self.connect_clk_rst(node) @@ -362,22 +419,25 @@ def apply(self, model): if producer is None: continue j = list(producer.output).index(node.input[i]) - src_intf_name = getCustomOp(producer).get_verilog_top_module_intf_names()[ - "m_axis" - ][j][0] + prod = getCustomOp(producer) + if not isinstance(prod, HWCustomOp): + raise FINNInternalError( + f"Producer node {producer.name} is not an HWCustomOp, " + "cannot connect AXI interfaces." + ) + src_intf_name = prod.get_verilog_top_module_intf_names()["m_axis"][j][0] dst_intf_name = node_inst.get_verilog_top_module_intf_names()["s_axis"][i][0] self.connect_cmds.append( - "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins %s/%s]" - % (producer.name, src_intf_name, node.name, dst_intf_name) + f"connect_bd_intf_net [get_bd_intf_pins {producer.name}/{src_intf_name}] " + f"[get_bd_intf_pins {node.name}/{dst_intf_name}]" ) # process external inputs and outputs in top-level graph input order - for input in model.graph.input: - inp_name = input.name + for graph_input in model.graph.input: + inp_name = graph_input.name inp_cons = model.find_consumers(inp_name) - assert inp_cons != [], "No consumer for input " + inp_name - assert len(inp_cons) == 1, "Multiple consumers for input " + inp_name + assert inp_cons != [], f"No consumer for input {inp_name}" + assert len(inp_cons) == 1, f"Multiple consumers for input {inp_name}" node = inp_cons[0] node_inst = getCustomOp(node) for i in range(len(node.input)): @@ -386,7 +446,7 @@ def apply(self, model): for output in model.graph.output: out_name = output.name node = model.find_producer(out_name) - assert node is not None, "No producer for output " + out_name + assert node is not None, f"No producer for output {out_name}" node_inst = getCustomOp(node) for i in range(len(node.output)): if node.output[i] == out_name: @@ -403,147 +463,171 @@ def apply(self, model): model.set_metadata_prop("vivado_stitch_proj", vivado_stitch_proj_dir) # start building the tcl script tcl = [] - # create vivado project - tcl.append( - "create_project %s %s -part %s" % (prjname, vivado_stitch_proj_dir, self.fpgapart) - ) - # no warnings on long module names - tcl.append("set_msg_config -id {[BD 41-1753]} -suppress") - # add all the generated IP dirs to ip_repo_paths + + # Project setup ip_dirs_str = " ".join(ip_dirs) - tcl.append("set_property ip_repo_paths [%s] [current_project]" % ip_dirs_str) - tcl.append("update_ip_catalog") - # create block design and instantiate all layers block_name = self.ip_name - tcl.append('create_bd_design "%s"' % block_name) + + tcl.extend( + [ + f"create_project {prjname} {vivado_stitch_proj_dir} -part {self.fpgapart}", + "set_msg_config -id {[BD 41-1753]} -suppress", + f"set_property ip_repo_paths [{ip_dirs_str}] [current_project]", + "update_ip_catalog", + f'create_bd_design "{block_name}"', + ] + ) + # Add commands and validate design tcl.extend(self.create_cmds) tcl.extend(self.connect_cmds) + fclk_mhz = 1 / (self.clk_ns * 0.001) fclk_hz = fclk_mhz * 1000000 - tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz)) + + # Configure clocks and validate design + clock_config = [f"set_property CONFIG.FREQ_HZ {round(fclk_hz)} [get_bd_ports /ap_clk]"] if self.clock2x_is_external: - tcl.append( - "set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk2x]" % round(2 * fclk_hz) + clock_config.append( + f"set_property CONFIG.FREQ_HZ {round(2 * fclk_hz)} [get_bd_ports /ap_clk2x]" ) - tcl.append("validate_bd_design") - tcl.append("save_bd_design") - # create wrapper hdl (for rtlsim later on) - bd_base = "%s/%s.srcs/sources_1/bd/%s" % ( - vivado_stitch_proj_dir, - prjname, - block_name, + + clock_config.extend(["validate_bd_design", "save_bd_design"]) + + tcl.extend(clock_config) + + # Create wrapper HDL + bd_base = f"{vivado_stitch_proj_dir}/{prjname}.srcs/sources_1/bd/{block_name}" + bd_filename = f"{bd_base}/{block_name}.bd" + wrapper_filename = f"{bd_base}/hdl/{block_name}_wrapper.v" + + tcl.extend( + [ + f"make_wrapper -files [get_files {bd_filename}] -top", + f"add_files -norecurse {wrapper_filename}", + f"set_property top {block_name}_wrapper [current_fileset]", + ] ) - bd_filename = "%s/%s.bd" % (bd_base, block_name) - tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename) - wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name) - tcl.append("add_files -norecurse %s" % wrapper_filename) + model.set_metadata_prop("wrapper_filename", wrapper_filename) - tcl.append("set_property top %s_wrapper [current_fileset]" % block_name) - # synthesize to DCP and export stub, DCP and constraints + num_workers = get_num_default_workers() + assert num_workers >= 0, "Number of workers must be nonnegative." + if num_workers == 0: + num_workers = mp.cpu_count() + + fifosim_wrapper_filename = None + if self.functional_simulation: + bd_base_sim = f"{vivado_stitch_proj_dir}/{prjname}.sim/sim_1/synth/func/xsim/" + fifosim_wrapper_filename = f"{bd_base_sim}/fifosim_wrapper_func_synth.v" + + tcl.extend( + [ + f"launch_runs synth_1 -jobs {num_workers}", + "wait_on_run [get_runs synth_1]", + "open_run synth_1 -name synth_1", + "opt_design", + # "opt_design -muxf_remap -carry_remap -control_set_merge " + # "-merge_equivalent_drivers -mbufg_opt -dsp_register_opt " + # "-control_set_opt -remap -resynth_area -resynth_remap", + # "opt_design", + f"write_verilog -mode funcsim -force -file {fifosim_wrapper_filename}", + ] + ) + + model.set_metadata_prop("wrapper_filename", fifosim_wrapper_filename) + # Synthesize to DCP and export stub, DCP and constraints if self.vitis: - tcl.append( - "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]" % bd_filename - ) - tcl.append( - "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} " - "-value {-mode out_of_context} -objects [get_runs synth_1]" - ) - num_workers = get_num_default_workers() - assert num_workers >= 0, "Number of workers must be nonnegative." - if num_workers == 0: - num_workers = mp.cpu_count() - tcl.append("launch_runs synth_1 -jobs %s" % str(num_workers)) - tcl.append("wait_on_run [get_runs synth_1]") - tcl.append("open_run synth_1 -name synth_1") - tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name) - tcl.append("write_checkpoint %s.dcp" % block_name) - tcl.append("write_xdc %s.xdc" % block_name) - tcl.append( - "report_utilization -hierarchical -hierarchical_depth 5 " - "-file %s_partition_util.rpt" % block_name - ) - # export block design itself as an IP core + tcl.extend( + [ + f"set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files {bd_filename} ]", + "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} " + "-value {-mode out_of_context} -objects [get_runs synth_1]", + f"launch_runs synth_1 -jobs {num_workers}", + "wait_on_run [get_runs synth_1]", + "open_run synth_1 -name synth_1", + f"write_verilog -force -mode synth_stub {block_name}.v", + f"write_checkpoint {block_name}.dcp", + f"write_xdc {block_name}.xdc", + f"report_utilization -hierarchical -hierarchical_depth 5 " + f"-file {block_name}_partition_util.rpt", + ] + ) + # Export block design itself as an IP core block_vendor = "xilinx_finn" block_library = "finn" - block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name) + block_vlnv = f"{block_vendor}:{block_library}:{block_name}:1.0" model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv) model.set_metadata_prop("vivado_stitch_ifnames", json.dumps(self.intf_names)) - tcl.append( - ( - "ipx::package_project -root_dir %s/ip -vendor %s " - "-library %s -taxonomy /UserIP -module %s -import_files" - ) - % (vivado_stitch_proj_dir, block_vendor, block_library, block_name) - ) - # Allow user to customize clock in deployment of stitched IP - tcl.append("set_property ipi_drc {ignore_freq_hz true} [ipx::current_core]") - # in some cases, the IP packager seems to infer an aperture of 64K or 4G, - # preventing address assignment of the DDR_LOW and/or DDR_HIGH segments - # the following is a hotfix to remove this aperture during IODMA packaging - tcl.append( - "ipx::remove_segment -quiet m_axi_gmem0:APERTURE_0 " - "[ipx::get_address_spaces m_axi_gmem0 -of_objects [ipx::current_core]]" - ) - tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" % block_vlnv) - tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" % block_vlnv) - # mark bus interface params as user-resolvable to avoid FREQ_MHZ mismatches - tcl.append( - "set_property value_resolve_type user [ipx::get_bus_parameters " - "-of [ipx::get_bus_interfaces -of [ipx::current_core ]]]" + + # Package IP and configure properties + tcl.extend( + [ + f"ipx::package_project -root_dir {vivado_stitch_proj_dir}/ip " + f"-vendor {block_vendor} -library {block_library} -taxonomy /UserIP " + f"-module {block_name} -import_files", + "set_property ipi_drc {ignore_freq_hz true} [ipx::current_core]", + "ipx::remove_segment -quiet m_axi_gmem0:APERTURE_0 " + "[ipx::get_address_spaces m_axi_gmem0 -of_objects [ipx::current_core]]", + f"set_property core_revision 2 [ipx::find_open_core {block_vlnv}]", + f"ipx::create_xgui_files [ipx::find_open_core {block_vlnv}]", + "set_property value_resolve_type user [ipx::get_bus_parameters " + "-of [ipx::get_bus_interfaces -of [ipx::current_core ]]]", + ] ) - # if targeting Vitis, add some properties to the IP + # If targeting Vitis, add some properties to the IP if self.vitis: - # replace source code with dcp - tcl.append("set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv) - tcl.append("set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv) - tcl.append("set_property supported_families { } [ipx::find_open_core %s]" % block_vlnv) - tcl.append( - "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} " - "[ipx::find_open_core %s]" % block_vlnv - ) - tcl.append( - "set_property auto_family_support_level level_2 " - "[ipx::find_open_core %s]" % block_vlnv - ) - # remove all files from synthesis and sim groups - # we'll replace with DCP, stub, and xdc - tcl.append( - "ipx::remove_all_file " - "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]" - ) - tcl.append("ipx::remove_all_file " "[ipx::get_file_groups xilinx_anylanguagesynthesis]") - tcl.append( - "ipx::remove_file_group " - "xilinx_anylanguagebehavioralsimulation [ipx::current_core]" - ) - tcl.append("ipx::remove_file_group " "xilinx_anylanguagesynthesis [ipx::current_core]") - # remove sim and src folders - tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir) - tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir) - # copy and add DCP, stub, and xdc - tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir) - tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir) - tcl.append("file copy -force %s.dcp %s/ip/dcp" % (block_name, vivado_stitch_proj_dir)) - tcl.append("file copy -force %s.xdc %s/ip/impl" % (block_name, vivado_stitch_proj_dir)) - tcl.append("ipx::add_file_group xilinx_implementation [ipx::current_core]") - tcl.append( - "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]" - % block_name - ) - tcl.append( - "set_property used_in [list implementation] " - "[ipx::get_files impl/%s.xdc " - "-of_objects [ipx::get_file_groups xilinx_implementation]]" % block_name - ) - tcl.append("ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]") - tcl.append( - "ipx::add_file dcp/%s.dcp " - "[ipx::get_file_groups xilinx_synthesischeckpoint]" % block_name - ) - tcl.append("ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]") - tcl.append( - "ipx::add_file dcp/%s.dcp " - "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name + # Configure Vitis kernel properties + tcl.extend( + [ + f"set_property sdx_kernel true [ipx::find_open_core {block_vlnv}]", + f"set_property sdx_kernel_type rtl [ipx::find_open_core {block_vlnv}]", + f"set_property supported_families {{}} [ipx::find_open_core {block_vlnv}]", + f"set_property xpm_libraries {{XPM_CDC XPM_MEMORY XPM_FIFO}} " + f"[ipx::find_open_core {block_vlnv}]", + f"set_property auto_family_support_level level_2 " + f"[ipx::find_open_core {block_vlnv}]", + ] + ) + + # Remove all files from synthesis and sim groups and replace with DCP + tcl.extend( + [ + "ipx::remove_all_file " + "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]", + "ipx::remove_all_file [ipx::get_file_groups xilinx_anylanguagesynthesis]", + "ipx::remove_file_group " + "xilinx_anylanguagebehavioralsimulation [ipx::current_core]", + "ipx::remove_file_group xilinx_anylanguagesynthesis [ipx::current_core]", + ] + ) + + # Setup file structure for DCP-based IP + tcl.extend( + [ + f"file delete -force {vivado_stitch_proj_dir}/ip/sim", + f"file delete -force {vivado_stitch_proj_dir}/ip/src", + f"file mkdir {vivado_stitch_proj_dir}/ip/dcp", + f"file mkdir {vivado_stitch_proj_dir}/ip/impl", + f"file copy -force {block_name}.dcp {vivado_stitch_proj_dir}/ip/dcp", + f"file copy -force {block_name}.xdc {vivado_stitch_proj_dir}/ip/impl", + ] + ) + + # Add implementation and checkpoint file groups + tcl.extend( + [ + "ipx::add_file_group xilinx_implementation [ipx::current_core]", + f"ipx::add_file impl/{block_name}.xdc " + "[ipx::get_file_groups xilinx_implementation]", + f"set_property used_in [list implementation] " + f"[ipx::get_files impl/{block_name}.xdc " + f"-of_objects [ipx::get_file_groups xilinx_implementation]]", + "ipx::add_file_group xilinx_synthesischeckpoint [ipx::current_core]", + f"ipx::add_file dcp/{block_name}.dcp " + f"[ipx::get_file_groups xilinx_synthesischeckpoint]", + "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]", + f"ipx::add_file dcp/{block_name}.dcp " + f"[ipx::get_file_groups xilinx_simulationcheckpoint]", + ] ) # add a rudimentary driver mdd to get correct ranges in xparameters.h later on min_driver = get_templates_folder() / "ipcore_driver" @@ -626,30 +710,30 @@ def apply(self, model): """ ) - # export list of used Verilog files (for rtlsim later on) - tcl.append( - "set all_v_files [get_files -filter {USED_IN_SYNTHESIS == 1 " - + "&& (FILE_TYPE == Verilog || FILE_TYPE == SystemVerilog " - + '|| FILE_TYPE =="Verilog Header")}]' + # Export list of used Verilog files (for rtlsim later on) + v_file_list = f"{vivado_stitch_proj_dir}/all_verilog_srcs.txt" + tcl.extend( + [ + "set all_v_files [get_files -filter {USED_IN_SYNTHESIS == 1 " + "&& (FILE_TYPE == Verilog || FILE_TYPE == SystemVerilog " + '|| FILE_TYPE =="Verilog Header")}]', + f"set fp [open {v_file_list} w]", + "foreach vf $all_v_files {puts $fp $vf}", + "close $fp", + ] ) - v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir - tcl.append("set fp [open %s w]" % v_file_list) - # write each verilog filename to all_verilog_srcs.txt - tcl.append("foreach vf $all_v_files {puts $fp $vf}") - tcl.append("close $fp") # write the project creator tcl script tcl_string = "\n".join(tcl) + "\n" - with open(vivado_stitch_proj_dir + "/make_project.tcl", "w") as f: + with Path(f"{vivado_stitch_proj_dir}/make_project.tcl").open("w") as f: f.write(tcl_string) # create a shell script and call Vivado - make_project_sh = vivado_stitch_proj_dir + "/make_project.sh" - working_dir = os.getcwd() - with open(make_project_sh, "w") as f: + make_project_sh = f"{vivado_stitch_proj_dir}/make_project.sh" + working_dir = Path.cwd() + with Path(make_project_sh).open("w") as f: f.write("#!/bin/bash \n") - f.write("cd {}\n".format(vivado_stitch_proj_dir)) - f.write("set -e\n") # Exit with non-zero if vivado fails. + f.write(f"cd {vivado_stitch_proj_dir}\n") f.write("vivado -mode batch -source make_project.tcl\n") - f.write("cd {}\n".format(working_dir)) + f.write(f"cd {working_dir}\n") bash_command = ["bash", make_project_sh] try: @@ -661,17 +745,22 @@ def apply(self, model): f"{vivado_stitch_proj_dir} to find out why it failed." ) from e + if self.functional_simulation: + with Path(v_file_list).open("a") as f: + f.write(f"{fifosim_wrapper_filename}\n") + # wrapper may be created in different location depending on Vivado version - if not os.path.isfile(wrapper_filename): + if not Path(wrapper_filename).is_file(): # check in alternative location (.gen instead of .srcs) wrapper_filename_alt = wrapper_filename.replace(".srcs", ".gen") - if os.path.isfile(wrapper_filename_alt): - model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) + if Path(wrapper_filename_alt).is_file(): + if not self.functional_simulation: + model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) else: - raise FINNError( - """CreateStitchedIP failed, no wrapper HDL found under %s or %s. + raise FINNUserError( + f"""CreateStitchedIP failed, no wrapper HDL found \ + under {wrapper_filename} or {wrapper_filename_alt}. Please check logs under the parent directory.""" - % (wrapper_filename, wrapper_filename_alt) ) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py index ebc60d31d6..8b390eaa00 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py @@ -71,7 +71,7 @@ def applyNodeLocal(self, node): # call the compilation function for this node inst.ipgen_singlenode_code() else: - log.info(f"Using pre-existing IP for {node.name}") + log.debug(f"Using pre-existing IP for {node.name}") # ensure that executable path is now set assert ( inst.get_nodeattr("ipgen_path") != "" diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 94f2c29605..653d863283 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -133,6 +133,8 @@ def apply(self, model): # check if outFIFOdepths attribute of first node # and inFIFOdepths attribute of consumer node is equal + idx_out = min(idx_out, len(n0.get_nodeattr("outFIFODepths")) - 1) + idx_inp = min(idx_inp, len(n1.get_nodeattr("inFIFODepths")) - 1) n0_depth = n0.get_nodeattr("outFIFODepths")[idx_out] n1_depth = n1.get_nodeattr("inFIFODepths")[idx_inp] diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index bc1d66c71f..4a2f8997a8 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -108,7 +108,7 @@ def resolve_dt_name(s: str) -> str: if s in ["BINARY", "TERNARY", "BIPOLAR"]: return "Datatype" + s[0] + s[1:].lower() elif s.startswith("U"): - return "DatatypeUint<" + s.replace("UINT", "") + ">" + return "DatatypeUInt<" + s.replace("UINT", "") + ">" elif s.startswith("I"): return "DatatypeInt<" + s.replace("INT", "") + ">" elif "FLOAT" in s: diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py index a60c8b6b49..57539c9afc 100644 --- a/src/finn/transformation/fpgadataflow/prepare_ip.py +++ b/src/finn/transformation/fpgadataflow/prepare_ip.py @@ -53,7 +53,7 @@ def _codegen_single_node(node, model, fpgapart, clk): # ensure that there is generated code inside the dir inst.code_generation_ipgen(model, fpgapart, clk) else: - log.info(f"Using pre-existing code for {node.name}") + log.debug(f"Using pre-existing code for {node.name}") except KeyError: # exception if op_type is not supported raise Exception(f"Custom op_type {op_type} is currently not supported.") diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index a87c00c802..a717354cb3 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -220,11 +220,15 @@ def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, Literal[False]]: def xsi_fifosim( - model: ModelWrapper, - n_inferences: int, - max_iters: float | None = None, - throttle_cycles: int = 0, -) -> dict[str, int]: + model, + n_inferences, + is_single_node, + total_nodes: int = 1, + current_node_index: int | None = None, + previous_node_name: str | None = None, + max_iters=None, + throttle_cycles=0, +): """Create a XSI model of stitched IP and use a simple C++ driver to drive the input stream. Useful for FIFO sizing, latency and throughput measurement. If max_iters is None, use the default @@ -243,6 +247,10 @@ def xsi_fifosim( ret_dict = rtlsim_exec_cppxsi( model, ctx, + is_single_node, + total_nodes=total_nodes, + current_node_index=current_node_index, + previous_node_name=previous_node_name, dummy_data_mode=True, timeout_cycles=max_iters, throttle_cycles=throttle_cycles, @@ -405,7 +413,11 @@ def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, Literal[False]]: throttle_cycles = 0 sim = xsi_fifosim( - model, self.cfg_n_inferences, max_iters=max_iters, throttle_cycles=int(throttle_cycles) + model, + self.cfg_n_inferences, + False, + max_iters=max_iters, + throttle_cycles=int(throttle_cycles), ) for ind, node in enumerate(fifo_nodes): @@ -451,9 +463,9 @@ def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, Literal[False]]: reset_implementation(node_inst) modified_fc_nodes.remove(node.name) - assert ( - len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0 - ), "FIFO/FC nodes left untouched after model reconfiguration" + assert len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0, ( + "FIFO/FC nodes left untouched after model reconfiguration" + ) # handle custom sizing for SWG FIFOs if desired if self.swg_exception: @@ -608,6 +620,10 @@ def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, Literal[False]]: dtype = cast("str", n_inst.get_nodeattr("dataType")) ram_style = n_inst.get_nodeattr("ram_style") shape = model.get_tensor_shape(node.input[0]) + log.info( + f"Splitting FIFO {node.name} of depth {depth} " + f"into {len(cfgs)} FIFOs with depths {[c[0] for c in cfgs]}" + ) for i, (fifo_depth, impl_style) in enumerate(cfgs): inp = node.input[0] if i == 0 else node.name + "_" + str(i - 1) + "_out" if i == len(cfgs) - 1: diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 6a23727bb3..dc069e9462 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -265,7 +265,7 @@ def apply(self, model): node_inst.set_nodeattr("SIMD", 1) channels_per_stream = node_inst.get_nodeattr("ChannelsPerStream") for simd_val in common_divisors(channels_per_stream): - node_inst.set_nodeattr("SIMD", simd_val) + node_inst.set_nodeattr("SIMD", int(simd_val)) cyc = node_inst.get_exp_cycles() if cyc < self.target_cycles_per_frame: break @@ -274,7 +274,7 @@ def apply(self, model): dim = int(node_inst.get_normal_input_shape()[-1]) for simd_val in divisors(dim): if dim // simd_val > 12: - node_inst.set_nodeattr("SIMD", simd_val) + node_inst.set_nodeattr("SIMD", int(simd_val)) cyc = node_inst.get_exp_cycles() if cyc < self.target_cycles_per_frame: break diff --git a/src/finn/transformation/fpgadataflow/simulation.py b/src/finn/transformation/fpgadataflow/simulation.py new file mode 100644 index 0000000000..e0d57197b4 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/simulation.py @@ -0,0 +1,278 @@ +"""Manages the Simulation superclass as well as general simulation related transforms.""" + +import json +import pandas as pd +from pathlib import Path +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from typing import Any, TypeAlias, cast + +from finn.builder.build_dataflow_config import DataflowBuildConfig +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.simulation_build import BuildSimulation, SimulationType +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.exception import FINNInternalError, FINNUserError +from finn.util.logging import log + +FIFODepthConfig: TypeAlias = list[dict[str, list[int]]] + + +def store_fifo_data( + model: ModelWrapper, + data: pd.DataFrame, + default_path: Path, + delete_existing: bool, + sort_on: str = "onnx_index", + merge_on: list[str] | None = None, + merge_how: str = "inner", + store_html: bool = True, +) -> ModelWrapper: + """Store the given dataframe in a CSV file. + + If the model already points to data, merge with it and store at the + path used before (unless delete_existing=True, then simply overwrite at that same path). + If no data is stored beforehand, use the `default_path` and simply store + the data there. The path is then entered into the `"fifo_data_path"` metadata prop of the model. + + The function can be used to aggregate benchmarking data across several flow steps. + + Args: + model: The model that we check for a path to existing FIFO data. + data: The data to store. + default_path: Path to use in case that the model doesn't reference a data file yet. + Is then stored as a metadata prop in the model. + delete_existing: If true, delete the table and start a new one. + sort_on: The column to sort on after merging. + merge_on: What columns to merge on. If "None", use `["onnx_index", "node", "stream"]` + merge_how: How to merge. Forwarded to pd.merge(). + store_html: If True, also store the data as a HTML with the same name next to the CSV. + + Returns: + model: Return the model since we might have modified its metadata. + """ + # Check if all layers are accounted for + # Note: data may have multiple rows per node (one per output stream) + if "node" in data.columns: + num_unique_nodes = len(data["node"].unique()) + if num_unique_nodes != len(model.graph.node): + raise FINNInternalError( + f"Tried storing FIFO data for {num_unique_nodes} unique nodes " + f"but expected {len(model.graph.node)}" + ) + elif len(data.index) != len(model.graph.node): + raise FINNInternalError( + f"Tried storing FIFO data for {len(data.index)} " + f"values but expected {len(model.graph.node)}" + ) + fifo_data_path = model.get_metadata_prop("fifo_data_path") + if fifo_data_path is not None: + if not fifo_data_path.endswith(".csv"): + raise FINNInternalError( + f"It seems the model saved path to store " + f"the dataframe does not point to a csv file: {fifo_data_path}" + ) + if delete_existing: + Path(fifo_data_path).unlink(missing_ok=True) + merged = data + else: + merged = pd.merge( + data, pd.read_csv(fifo_data_path), on=merge_on, how=merge_how # type: ignore + ) + merged = merged.sort_values(sort_on) + merged.to_csv(fifo_data_path, index=False) + if store_html: + merged.to_html(fifo_data_path.replace(".csv", ".html")) + log.info(f"Stored FIFO dataframe to {fifo_data_path}.") + else: + if not default_path.suffix == ".csv": + raise FINNInternalError( + f"It seems the provided default path to store " + f"the dataframe does not point to a csv file: {fifo_data_path}" + ) + if delete_existing: + default_path.unlink(missing_ok=True) + data.to_csv(default_path, index=False) + if store_html: + data.to_html(str(default_path).replace(".csv", ".html")) + model.set_metadata_prop("fifo_data_path", str(default_path)) + log.info(f"Stored FIFO dataframe to {default_path}.") + return model + + +class Simulation: + """Manage simulation (runs) in FINN. Upon instance creation, the simulation will be built. + Simulations should inherit from this class and expand for their specific needs. + + IMPORTANT: If the modelwrapper was somehow changed, create a NEW simulation object! + """ + + def __init__( + self, + model: ModelWrapper, + simulation_type: SimulationType, + fpgapart: str, + clk_ns: float, + functional_sim: bool, + workers: int | None = None, + ) -> None: + """Create a new simulation instance. Read simulation binary paths + from the simulation_binaries metadata prop field.""" + self.simulation_type = simulation_type + self.model = model + sim_binaries = self.model.get_metadata_prop("simulation_binaries") + + if sim_binaries is None: + raise FINNUserError( + "No field simulation_binaries found in the model. Make " + "sure to run the BuildSimulation transformation beforehand." + ) + sim_binaries: list[Path] = [Path(p) for p in str(sim_binaries).split("\n")] + if len(sim_binaries) != len(self.model.graph.node): + raise FINNUserError( + "The number of found simulation binaries does not match the number " + "of nodes in the graph. Make sure to run BuildSimulation just " + "before." + ) + if any(not p.exists() for p in sim_binaries): + raise FINNUserError( + "Simulation binary data points to invalid paths. Please rerun BuildSimulation." + ) + # TODO: Currently we have to recompile even if we just + # TODO: called BuildSimulation in the step before + # (However this only compiles, it should NOT stitch the IPs again) + self.model = self.model.transform(BuildSimulation(fpgapart, clk_ns, functional_sim)) + self.binaries: dict[int, Path] = {i: sim_binaries[i] for i in range(len(sim_binaries))} + match simulation_type: + case SimulationType.NODE_BASED_CONNECTED: + self.binaries = { + i: self.binaries[i] / "LayerSimulationBackend" for i in self.binaries.keys() + } + case SimulationType.NODE_BASED_ISOLATED: + self.binaries = { + i: self.binaries[i] / "IsolatedSimulationBackend" for i in self.binaries.keys() + } + case _: + raise FINNInternalError(f"Unsupported simulation type: {simulation_type}") + + errors = [] + for binary in self.binaries.values(): + if not binary.exists(): + errors.append(f"Binary {binary} does not exist! Please rerun BuildSimulation!") + if len(errors) > 0: + raise FINNInternalError("Errors occurred: \n" + "\n\t".join(errors)) + + def simulate(self) -> Any: + raise NotImplementedError("Call simulate() on subclasses.") + + +class ApplyFIFOSizes(Transformation): + """Apply a FIFO sizing configuration to the model. + If FIFOs already exist the step is skipped.""" + + def __init__( + self, + cfg: DataflowBuildConfig, + fifo_config: Path | None = None, + max_qsrl_depth: int = 256, + vivado_ram_style: str = "block", + ) -> None: + """If given read the config json from the given path. + Otherwise check in the output directory. + """ + self.cfg = cfg + self.max_qsrl_depth = max_qsrl_depth + self.vivado_ram_style = vivado_ram_style + if fifo_config is None: + self.path = Path(cfg.output_dir) / "fifo_config.json" + else: + self.path = fifo_config + + self.fifo_depths: FIFODepthConfig = [] + with self.path.open() as f: + self.fifo_depths = cast("FIFODepthConfig", json.load(f)) + + def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, bool]: + """Apply FIFO Simulation Depths to the model.""" + if len(list(filter(lambda node: "StreamingFIFO" in node.op_type, model.graph.node))) > 0: + log.warning( + "It seems that StreamingFIFOs have already " + "been inserted into the graph. Skipping insertion of FIFOs." + ) + return model, False + + if len(model.graph.node) != len(self.fifo_depths): + raise FINNUserError( + "There are no StreamingFIFOs in the graph, yet the number " + "of nodes and number of FIFO sizes differ. There may be " + "unaccounted for nodes that have not been part of the FIFO " + "simulation. Consider re-running simulation directly before " + "applying the FIFO sizes. It might also be that your model " + "or config is outdated, in which case it is recommended to " + "re-run the entire flow from start to finish." + ) + + # FIFO sizes are set as the maximum of outFIFODepth and inFIFODepth of the successor node + # Only set the outFIFODepth, because setting both is redundant as inFIFODepth defaults to 0. + # Remove all in/outFIFODepths in model for clean slate + graph = model.graph + for node in graph.node: + predecessors = model.find_direct_predecessors(node) + successors = model.find_direct_successors(node) + n = getCustomOp(node) + if n is not None: + if predecessors is not None: + n.set_nodeattr("inFIFODepths", [0] * len(predecessors)) + if successors is not None: + n.set_nodeattr("outFIFODepths", [0] * len(successors)) + + # Set new outFIFODepths according to config + graph = model.graph + node_ind = -1 + for first_node in graph.node: + node_ind += 1 + n0 = getCustomOp(first_node) + if n0 is None: + raise FINNInternalError( + f"Node {first_node.name} does not have a custom op instance." + " This is required for FIFO insertion." + ) + if first_node.name != self.fifo_depths[node_ind]["node"]: + raise FINNInternalError( + f"Node name {first_node.name} does not match expected name " + f"{self.fifo_depths[node_ind]['node']} at index {node_ind}. " + "This may be due to a mismatch between the model and the config, " + "or due to changes in the model after the simulation was run. " + "Consider re-running the entire flow from start to finish." + ) + fifos = cast("list[int]", (self.fifo_depths[node_ind]["depths"])) + n0.set_nodeattr("outFIFODepths", fifos) + + # Insert the FIFOs into the model + model = model.transform(InsertFIFO(True, self.max_qsrl_depth, self.vivado_ram_style)) + + model = model.transform(GiveUniqueNodeNames()) + model: ModelWrapper = model.transform(GiveReadableTensorNames()) + model = model.transform(SpecializeLayers(self.cfg._resolve_fpga_part())) # noqa + model = model.transform(GiveUniqueNodeNames()) + model: ModelWrapper = model.transform(GiveReadableTensorNames()) + + # Sanity check to make sure fifos were inserted + inserted_fifo_count = sum( + [int("StreamingFIFO" in node.op_type) for node in model.graph.node] + ) + if inserted_fifo_count == 0: + raise FINNInternalError( + "No FIFOs were inserted. This may be due to " + "wrong network configuration, step order or " + "a number of other things." + ) + if inserted_fifo_count < int(0.4 * float(len(model.graph.node))): + log.warning( + "The number of inserted FIFOs makes up less than 40%" + " of the total number of nodes in the model. This could " + "point to a potential error." + ) + + return model, False diff --git a/src/finn/transformation/fpgadataflow/simulation_build.py b/src/finn/transformation/fpgadataflow/simulation_build.py new file mode 100644 index 0000000000..a21102b8f0 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/simulation_build.py @@ -0,0 +1,832 @@ +"""Build FINN Simulations.""" + +import finn_xsi.adapter as finnxsi +import numpy as np +import onnx +import os +import psutil +import shlex +import subprocess +import sys +import time +from ast import literal_eval +from collections.abc import Callable +from concurrent.futures import Future, ThreadPoolExecutor +from enum import Enum +from onnx import NodeProto, TensorProto, ValueInfoProto +from pathlib import Path +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import get_by_name +from subprocess import CalledProcessError +from typing import TYPE_CHECKING, Any, cast + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNInternalError, FINNUserError +from finn.util.logging import log + +if TYPE_CHECKING: + from collections.abc import Sequence + + +# TODO: Fix that BuildSimulation has to return binaries for either SimulationType +# TODO: Just store the directory instead - since we build all targets anyways + + +class SimulationType(str, Enum): + """Type of simulation.""" + + # Individual node simulations connected by IPC + NODE_BASED_CONNECTED = "NODE_BASED_CONNECTED" + + # Individual node simulations, isolated. E.g. for analysis purposes + NODE_BASED_ISOLATED = "NODE_BASED_ISOLATED" + + +class SimulationBuilder: + """Build simulations in FINN.""" + + def __init__(self, model: ModelWrapper, fpgapart: str, clk_ns: float) -> None: + """Create a new simulation instance.""" + self.model = model + self.fpgapart = fpgapart + self.clk_ns = clk_ns + + def _isolated_node_model(self, by_node: int | str | NodeProto) -> ModelWrapper: + """Return a modelwrapper that has only the specified node. + + Args: + by_node: If int, used as the index of the specified node. If string, assumed to be + the name of the node. + + Returns: + ModelWrapper: The isolated-node modelwrapper. + """ + # Find the node + index = 0 + if type(by_node) is int: + if by_node < 0 or by_node >= len(self.model.graph.node): + raise FINNInternalError( + f"Cannot isolate node index {by_node}. Model has" + f"{len(self.model.graph.node)} nodes." + ) + index = by_node + elif type(by_node) is str: + node_name = self.model.get_node_from_name(by_node) + if node_name is None: + raise FINNInternalError(f"Cannot isolate node {by_node}. No such node found.") + index = [n.name for n in self.model.graph.node].index(cast("str", node_name)) + elif type(by_node) is NodeProto: + try: + index = self.model.graph.node.index(by_node) + except Exception as e: + raise FINNInternalError(f"Node {by_node.name} not found in the model.") from e + else: + raise FINNInternalError( + f"Cannot find node to isolate: {by_node}. Specify either " + f"the index (int), node name (str) or the object itself " + f"(NodeProto)." + ) + + target_op = getCustomOp(self.model.graph.node[index]) + if not isinstance(target_op, HWCustomOp): + raise FINNInternalError( + f"Node {target_op.name} is not a HWCustomOp, cannot isolate for simulation." + ) + + initializers: list[TensorProto] = [] + value_info_protos: list[ValueInfoProto] = [] + inputs_graph: list[ValueInfoProto] = [] + inputs_node: list[ValueInfoProto] = [] + outputs_graph: list[ValueInfoProto] = [] + outputs_node: list[ValueInfoProto] = [] + nodes_graph: list[NodeProto] = [] + + preds_list: list | None = self.model.find_direct_predecessors(self.model.graph.node[index]) + succs_list: list | None = self.model.find_direct_successors(self.model.graph.node[index]) + + num_preds = len(preds_list) if preds_list is not None else 0 + num_succs = len(succs_list) if succs_list is not None else 0 + + input_node = False + output_node = False + + # Set correct input/output count for input and output nodes, since they have no pred/succ. + if num_preds == 0: + inputs = self.model.graph.input + ret = get_by_name( + inputs, self.model.graph.node[index].input[0] + ) # Check that node is graph input + if ret is not None: + num_preds = 1 + input_node = True + if num_succs == 0: + outputs = self.model.graph.output + ret = get_by_name( + outputs, self.model.graph.node[index].output[0] + ) # Check that node is graph output + if ret is not None: + num_succs = 1 + output_node = True + + num_inputs = len(self.model.graph.node[index].input) + num_outputs = len(self.model.graph.node[index].output) + + if num_outputs != num_succs: + raise FINNInternalError( + f"Node {self.model.graph.node[index].name} has {num_outputs} outputs but " + f"{num_succs} successor nodes. This is not supported for isolation." + ) + + initializer_inputs_list = [ + self.model.graph.node[index].input[i] + for i in range(num_inputs) + if self.model.get_initializer(self.model.graph.node[index].input[i]) is not None + ] + + # Handle initializers of nodes + initializer_inputs = [] + for init in initializer_inputs_list: + ret = self.model.get_initializer(init, return_dtype=True) + info = self.model.get_tensor_valueinfo(init) + if ret is None or info is None: + raise FINNInternalError( + f"Failed to get initializer for {init} " + f"while isolating node {self.model.graph.node[index].name}." + ) + vals, dtype = cast("tuple[np.ndarray, int]", ret) + initializers.append(onnx.helper.make_tensor(info.name, dtype, vals.shape, vals)) + val_info = onnx.helper.make_tensor_value_info(info.name, dtype, vals.shape) + value_info_protos.append(val_info) + initializer_inputs.append(val_info) + + pred_count = 0 + for i in range(num_inputs): + if self.model.graph.node[index].input[i] in initializer_inputs_list: + continue # This input is handled as an initializer, skip + pred_count += 1 + info = self.model.get_tensor_valueinfo(self.model.graph.node[index].input[i]) + if info is None: + raise FINNInternalError( + f"Failed to get value info for {self.model.graph.node[index].input[i]} " + f"while isolating node {self.model.graph.node[index].name}." + ) + # Setup new input tensors + new_input_info = onnx.helper.make_tensor_value_info( + info.name, + TensorProto.FLOAT, + cast("Sequence[int]", target_op.get_normal_input_shape(i)), + ) + new_input_dummy_info = onnx.helper.make_tensor_value_info( + info.name + "_dummy", + TensorProto.FLOAT, + cast("Sequence[int]", target_op.get_normal_input_shape(i)), + ) + # value_info_protos.append(new_input_info) + value_info_protos.append(new_input_dummy_info) + inputs_graph.append(new_input_info) + inputs_node.append(new_input_dummy_info) + + # Create new dummy node to remove data path for input i + dummy_node = onnx.helper.make_node( + "RemoveDataPath_rtl", + inputs=[new_input_info.name], + outputs=[new_input_dummy_info.name], + domain="finn.custom_op.fpgadataflow.rtl", + backend="fpgadataflow", + folded_shape=target_op.get_folded_input_shape(i), + normal_shape=target_op.get_normal_input_shape(i), + dataType=target_op.get_input_datatype(i).name, + name=self.model.graph.node[index].name + f"_input_dummy_{i}", + ) + + nodes_graph.append(dummy_node) + inputs_node.extend(initializer_inputs) + if pred_count != num_preds: + raise FINNInternalError( + f"Node {self.model.graph.node[index].name} has {num_preds} pred. nodes but only " + f"{pred_count} inputs have been handled." + ) + for i in range(num_succs): + info = self.model.get_tensor_valueinfo(self.model.graph.node[index].output[i]) + if info is None: + raise FINNInternalError( + f"Failed to get value info for {self.model.graph.node[index].output[i]} " + f"while isolating node {self.model.graph.node[index].name}." + ) + # Setup new input tensors + new_output_info = onnx.helper.make_tensor_value_info( + info.name, + TensorProto.FLOAT, + cast("Sequence[int]", target_op.get_normal_output_shape(i)), + ) + new_output_dummy_info = onnx.helper.make_tensor_value_info( + info.name + "_dummy", + TensorProto.FLOAT, + cast("Sequence[int]", target_op.get_normal_output_shape(i)), + ) + # value_info_protos.append(new_output_info) + value_info_protos.append(new_output_dummy_info) + outputs_graph.append(new_output_info) + outputs_node.append(new_output_dummy_info) + + # Create new dummy node to remove data path for output i + dummy_node = onnx.helper.make_node( + "RemoveDataPath_rtl", + inputs=[new_output_dummy_info.name], + outputs=[new_output_info.name], + domain="finn.custom_op.fpgadataflow.rtl", + backend="fpgadataflow", + folded_shape=target_op.get_folded_output_shape(i), + normal_shape=target_op.get_normal_output_shape(i), + dataType=target_op.get_output_datatype(i).name, + name=self.model.graph.node[index].name + f"_output_dummy_{i}", + ) + + nodes_graph.append(dummy_node) + + target_op_attrs = target_op.get_nodeattr_types() + params = {} + for attr in target_op_attrs.keys(): + attr_val = target_op.get_nodeattr(attr) + if ( + (isinstance(attr_val, np.ndarray) and attr_val.size == 0) + or attr_val == "" + or attr_val == [] + ): # Empty value, skip + continue + params[attr] = target_op.get_nodeattr(attr) + new_node = onnx.helper.make_node( + self.model.graph.node[index].op_type, + inputs=[inp.name for inp in inputs_node], + outputs=[outp.name for outp in outputs_node], + domain=self.model.graph.node[index].domain, + name=self.model.graph.node[index].name, + **params, + ) + nodes_graph.append(new_node) + + graph = onnx.helper.make_graph( + nodes_graph, + f"isolated_node_graph_{self.model.graph.node[index].name}", + inputs_graph, + outputs_graph, + initializer=initializers, + value_info=value_info_protos, + ) + + node_model = onnx.helper.make_model(graph) + node_model = ModelWrapper(node_model) + + node_model.set_metadata_prop("predecessors", str([pred.name for pred in inputs_graph])) + node_model.set_metadata_prop("successors", str([succ.name for succ in outputs_graph])) + node_model.set_metadata_prop("input_node", str(input_node).lower()) + node_model.set_metadata_prop("output_node", str(output_node).lower()) + + # node_model.save(f"isolated_node_model_{self.model.graph.node[index].name}.onnx") + + return node_model + + def _get_stream_descriptions(self, model: ModelWrapper) -> tuple[str, str]: + """Return the stream descriptions for the given model for the C++ sim config header. + + Used by for example _build_single_node_simulation(). + + Returns: + tuple[str, str]: Strings of stream descriptions + """ + # Get IO iterations required + instream_iters = [] + outstream_iters = [] + for top_inp in model.graph.input: + iname = top_inp.name + first_node = model.find_consumer(iname) + assert first_node is not None, "Failed to find consumer for " + iname + top_ind = list(first_node.input).index(iname) + ishape_folded = getCustomOp(first_node).get_folded_input_shape(ind=top_ind) + instream_iters.append(int(np.prod(ishape_folded[:-1]))) + for top_out in model.graph.output: + oname = top_out.name + last_node = model.find_producer(oname) + assert last_node is not None, "Failed to find producer for " + oname + top_ind = list(last_node.output).index(oname) + oshape_folded = getCustomOp(last_node).get_folded_output_shape(ind=top_ind) + outstream_iters.append(int(np.prod(oshape_folded[:-1]))) + + interface_names = model.get_metadata_prop("vivado_stitch_ifnames") + if interface_names is None: + raise FINNInternalError( + f"{model}: Could not find stitched-IP interface names. " + f"Did you run IP Stitching first?" + ) + interface_names = literal_eval(interface_names) + if "aximm" in interface_names.keys() and interface_names["aximm"] != []: + raise FINNInternalError( + f"{model}: CPP XSI Sim does not know how to handle full " + f"AXI MM interfaces: {interface_names['aximm']}" + ) + instream_names = [x[0] for x in interface_names["s_axis"]] + outstream_names = [x[0] for x in interface_names["m_axis"]] + + # Convert to the format required by the C++ sim config header + # (initializer list of pairs of name and iters) + def _format_descr_name(s: list[tuple[str, int]]) -> str: + return ", ".join([f'StreamDescriptor{{"{name}", {iters}}}' for name, iters in s]) + + instream_descrs = [ + (instream_names[i], instream_iters[i]) for i in range(len(instream_names)) + ] + instream_descrs_str = _format_descr_name(instream_descrs) + + outstream_descrs = [ + (outstream_names[i], outstream_iters[i]) for i in range(len(outstream_names)) + ] + outstream_descrs_str = _format_descr_name(outstream_descrs) + return instream_descrs_str, outstream_descrs_str + + def _create_sim_so( + self, + model: ModelWrapper, + top_module_name: str, + vivado_stitched_proj: Path, + build_dir: Path | None, + debug: bool, + ) -> tuple[Path, Path]: + """Create a new RTLSim .so file. If one exists already it is used. + + Returns: + tuple[Path, Path]: Return sim_base and sim_rel. + """ + rtlsim_so_str = model.get_metadata_prop("rtlsim_so") + if (rtlsim_so_str is None) or not Path(rtlsim_so_str).exists(): + all_verilog_srcs = ( + (Path(vivado_stitched_proj) / "all_verilog_srcs.txt").read_text().split() + ) + sim_dir = ( + make_build_dir(f"rtlsim_{model.graph.node[0].name}_") + if build_dir is None + else build_dir + ) + sim_base, sim_rel = finnxsi.compile_sim_obj( + top_module_name, all_verilog_srcs, str(sim_dir), debug=debug + ) + rtlsim_so = Path(sim_base) / Path(sim_rel) + model.set_metadata_prop("rtlsim_so", str(rtlsim_so)) + else: + sim_base, sim_rel = cast("str", rtlsim_so_str.split("xsim.dir")) + sim_rel = "xsim.dir" + sim_rel + return Path(sim_base), Path(sim_rel) + + def _compile_simulation(self, sim_base: Path, silent: bool = True) -> Path: + """Compile an existing RTLSIM directory. Requires _create_sim_so to be run before. Expects + rtlsim_config.hpp to be templated already. + + Returns: + Path: Path to the executable shell script to run the binary + """ + # Determine executable name + compile_targets = ["LayerSimulationBackend", "IsolatedSimulationBackend"] + if all((Path(sim_base) / execname).exists() for execname in compile_targets): + # Simulation was already compiled, we can return early + return Path(sim_base) + + # Check where FINNXSI is + finnxsi_dir = os.environ["FINN_XSI"] + + # Running CMake first + cmake_call = f"{sys.executable} -m cmake -S {finnxsi_dir} -B {sim_base}" + log.debug(f"Running cmake on RTLSIM Wrapper in {sim_base}") + try: + launch_process_helper( + shlex.split(cmake_call), + cwd=finnxsi_dir, + print_stdout=silent, + print_stderr=silent, + proc_env=os.environ.copy(), + ) + except CalledProcessError as e: + raise FINNInternalError(f"Failed to run cmake in {sim_base}") from e + + # Calling make to actually build the simulation + makefile = Path(sim_base) / "Makefile" + if not makefile.exists(): + raise FINNInternalError(f"Failed to create Makefile in {sim_base}!") + try: + launch_process_helper( + ["make"], + proc_env=os.environ.copy(), + cwd=sim_base, + print_stdout=silent, + print_stderr=silent, + ) + except CalledProcessError as e: + raise FINNInternalError(f"Failed to create executable in {sim_base}!") from e + + errors = [] + for target in compile_targets: + simulation_executable = Path(sim_base) / target + if not simulation_executable.exists(): + errors.append( + f"Simulation compile target {target} was not created. " + f"Check {sim_base} to run make manually." + ) + if len(errors) > 0: + raise FINNInternalError("Error compiling simulations: \n" + "\n\t".join(errors)) + return sim_base + + def _template_rtlsim_config( + self, + model: ModelWrapper, + sim_base: Path, + input_interface_names: list[str] | None, + output_interface_names: list[str] | None, + node_index: int, + total_nodes: int, + timeout_cycles: int, + top_module_name: str, + trace_file: str | None, + ) -> Path: + """Template finn_xsi/finn_xsi/rtlsim_config.hpp.template with the correct values and + return the templated file. + """ + finnxsi_dir = os.environ["FINN_XSI"] + # Prepare the C++ driver config template + ( + instream_descrs_str, + outstream_descrs_str, + ) = self._get_stream_descriptions(model) + template_dict = { + "TIMEOUT_CYCLES": timeout_cycles, + # name of the top-level HDL module + "TOP_MODULE_NAME": top_module_name, + # top-level AXI stream descriptors + "ISTREAM_DESC": instream_descrs_str, + "OSTREAM_DESC": outstream_descrs_str, + # control tracing and trace filename + "TRACE_FILE": "std::nullopt" if trace_file is None else f'"{trace_file}"', + # sim kernel .so to use (depends on Vivado version) + "SIMKERNEL_SO": finnxsi.get_simkernel_so(), + # log file for xsi (not the sim driver) + "XSIM_LOG_FILE": '"xsi.log"', + "INPUT_INTERFACE_NAMES": ",".join(['"' + name + '"' for name in input_interface_names]) + if input_interface_names is not None + else "", + "OUTPUT_INTERFACE_NAMES": ",".join( + ['"' + name + '"' for name in output_interface_names] + ) + if output_interface_names is not None + else "", + "INPUT_INTERFACE_COUNT": len(input_interface_names) + if input_interface_names is not None + else 0, + "OUTPUT_INTERFACE_COUNT": len(output_interface_names) + if output_interface_names is not None + else 0, + "NODE_INDEX": node_index, + "TOTAL_NODES": total_nodes, + "IS_INPUT_NODE": model.get_metadata_prop("input_node"), + "IS_OUTPUT_NODE": model.get_metadata_prop("output_node"), + } + + fifosim_config_fname = Path(finnxsi_dir) / "rtlsim_config.hpp.template" + fsim_config = fifosim_config_fname.read_text() + for key, val in template_dict.items(): + fsim_config = fsim_config.replace(f"@{key}@", str(val)) + # Write the config to the simulation directory + rtlsim_config = Path(sim_base) / "rtlsim_config.hpp" + rtlsim_config.write_text(fsim_config) + return rtlsim_config + + def build_single_node_simulation( + self, + node_model: ModelWrapper, + node_index: int, + total_nodes: int, + input_interface_names: list[str] | None, + output_interface_names: list[str] | None, + build_dir: Path | None, + timeout_cycles: int = 0, + silent: bool = False, + ) -> Path: + """Build the simulation binary for a single node. + + This can be used both by the connected node-by-node sim and the isolated node sim. + + Much of this is from the rtlsim_exec.py in core/ + + Args: + node_model: The single node ModelWrapper to build the simulation from. + node_index: The index of the simulated node. Used to determine whether a node shares IO + with successors or predecessors. + total_nodes: The total number of nodes in the complete design. + input_interface_names: Names of input interfaces for IPC communication. Required by the + connected simulation to access the correct shared memory segment + between this node and its predecessors. + output_interface_names: Names of output interfaces for IPC communication. Required by + the connected simulation to access the correct shared memory segment + between this node and its successors. + build_dir: If given, use this directory for building the simulation. Otherwise one is + created from the nodes name. + timeout_cycles: Number of cycles until simulation timeout. When set to 0 (default), no + timeout is given. + silent: If True, silences the Cmake and make output (including stderr) + + Returns: + Path: The path to the simulation binary (shell script). + """ + # TODO: Check if something is an output node instead of checking the node index + # TODO: Requires changes in the C++ code as well + + # Check that the relevant data exists + wrapper_filename = node_model.get_metadata_prop("wrapper_filename") + if wrapper_filename is None or not Path(wrapper_filename).exists(): + raise FINNUserError( + f"Call CreateStitchedIP prior to building " + f"the simulation for {self.model.graph.node[node_index].name}. " + f"wrapper_filename is set to {wrapper_filename}!" + ) + + vivado_stitched_proj = node_model.get_metadata_prop("vivado_stitch_proj") + if vivado_stitched_proj is None or not Path(vivado_stitched_proj).exists(): + raise FINNUserError( + f"Call CreateStitchedIP prior to building " + f"the simulation for {self.model.graph.node[node_index].name}." + "(vivado_stitch_proj not set!)" + ) + + trace_file = cast("str | None", node_model.get_metadata_prop("rtlsim_trace")) + debug = not (trace_file is None or trace_file == "") + + # Get the module name and path + top_module_file = Path(wrapper_filename).resolve().absolute() + top_module_name = top_module_file.name.strip(".v") + + # Build the simulation .so and save it in the "rtlsim_so" metadata prop + sim_base, _ = self._create_sim_so( + node_model, top_module_name, Path(vivado_stitched_proj), build_dir, debug + ) + + # Fill out the simulation config header + _ = self._template_rtlsim_config( + node_model, + sim_base, + input_interface_names, + output_interface_names, + node_index, + total_nodes, + timeout_cycles, + top_module_name, + trace_file, + ) + + # Building the whole simulation + return self._compile_simulation(sim_base, silent=silent).absolute() + + def _build_simulations_parallel( + self, with_live_display: bool, functional_sim: bool + ) -> dict[int, Path]: + """Build all nodes in the model in parallel, as isolated simulations, ready for usage in + an IPC connected simulation chain. + + Args: + workers: Number of parallel workers to use. + with_live_display: If True, display the building progress in a rich progress bar. + functional_sim: Use a functional simulation (faster but takes time to build) + sim_type: Type of simulation + + Returns: + Dict of executables that start the simulation of the given nodes, + indexed by the node-index. These are in their respective FINN_TMP + directories. + """ + log.info(f"Building simulation binaries for {len(self.model.graph.node)} layers.") + + def _build( + node_index: int, + total_nodes: int, + build_dir: Path, + ) -> Any: + nodemodel = self._isolated_node_model(node_index) + nodemodel = nodemodel.transform(InferShapes()) + nodemodel = nodemodel.transform(PrepareIP(self.fpgapart, self.clk_ns)) + nodemodel = nodemodel.transform( + CreateStitchedIP(self.fpgapart, self.clk_ns, functional_simulation=functional_sim) + ) + input_interface_names = nodemodel.get_metadata_prop("predecessors") + if input_interface_names is not None: + input_interface_names = literal_eval(input_interface_names) + output_interface_names = nodemodel.get_metadata_prop("successors") + if output_interface_names is not None: + output_interface_names = literal_eval(output_interface_names) + return self.build_single_node_simulation( + nodemodel, + node_index, + total_nodes, + input_interface_names, + output_interface_names, + build_dir, + silent=with_live_display, + ) + + total_nodes = len(self.model.graph.node) + log.info(f"[BuildSimulation] Preparing to build {total_nodes} nodes for the simulation.") + futures: dict[int, Future] = {} + built_nodes = 0 + + # Progress display callback + def _callback_progress(name: str) -> Callable: + nonlocal total_nodes, built_nodes + + def _f(f: Future) -> None: + nonlocal total_nodes, built_nodes + built_nodes += 1 + log.info( + f"[ [bold green]" + f"{int(100.0 * float(built_nodes) / float(total_nodes))}%[/bold green]" + f" ] {name}", + extra={"markup": True, "highlighter": None}, + ) + # Unpack result once so that the pool fails immediately, instead of waiting for + # all futures to be completed. + f.result() + + return _f + + # Build sims in parallel + synth_workers = max( + 1, cast("int", (psutil.virtual_memory().free / 1024 / 1024 / 1024) // 10) + ) # 10GB per synthesis + if not functional_sim: + # When not having to do synthesis, the build is not memory bottlenecked and + # can be executed as parallel as possible + synth_workers = int(os.environ.get("NUM_DEFAULT_WORKERS", len(self.model.graph.node))) + + # Build (stitched IP, cmake, make) all sims in parallel and return paths to + # the compiled executables + log.info("[BuildSimulation] Starting the build process.") + with ThreadPoolExecutor(max_workers=synth_workers) as pool: + for i in range(total_nodes): + node_name = self.model.graph.node[i].name + futures[i] = pool.submit( + _build, + i, + total_nodes - 1, + Path(make_build_dir(f"rtlsim_{node_name}_")), + ) + futures[i].add_done_callback(_callback_progress(node_name)) + pool.shutdown(wait=True) + + # Check if all binaries were compiled successfully + binaries = {i: future.result() for i, future in futures.items()} + not_found_binaries = [] + for i, binary in binaries.items(): + if binary is None: + not_found_binaries.append(i) + if len(not_found_binaries) > 0: + raise FINNInternalError( + "Building simulations failed. " + "Failed simulation binaries: " + ", ".join(not_found_binaries) + ) + return binaries + + def build_simulation(self, with_live_display: bool, functional_sim: bool) -> dict[int, Path]: + """Build a simulation of the given type, return the path to the executable directory + (indexed by the corresponding node index in the graph). + + Args: + simtype: Simulation type to build. + workers: Number of workers to use in parallel. + Normally set by the Simulation() class automatically. + with_live_display: If True, display a live progress-bar. + functional_sim: If True, use functional simulation (faster but takes some time to build) + """ + return self._build_simulations_parallel(with_live_display, functional_sim) + + +class BuildSimulation(Transformation): + """Build a simulation of the given type for the model. + Puts the model into a prepared state (changes the graph). + If simulation binaries already exist, enter their directory and only re-compile.""" + + def __init__( + self, + fpgapart: str, + clk_ns: float, + functional_sim: bool, + ) -> None: + """Create a new BuildSimulation transform.""" + self.functional_sim = functional_sim + self.fpgapart = fpgapart + self.clk_ns = clk_ns + + def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, bool]: + """Build / compile the model. Modifies the model.""" + self.model = model + + # Check if we already have stitched IPs and built simulations. If so, rerun only cmake/make + needs_rebuild = True + sim_binaries = self.model.get_metadata_prop("simulation_binaries") + + # 1. Check if binary paths are saved in the model + if sim_binaries is not None: + sim_binaries = sim_binaries.split("\n") + + # 2. Check that the model size hasn't changed since creating the binaries. Otherwise + # we should rebuild. + if len(sim_binaries) != len(self.model.graph.node): + log.info( + f"[BuildSimulation] Found existing binaries, but number ({len(sim_binaries)}) " + f"does not match number of nodes in the graph " + f"({len(self.model.graph.node)}). Rebuilding..." + ) + else: + log.info("Existing simulations found. Re-running only CMake/Make..") + needs_rebuild = False + else: + log.info("[BuildSimulation] No simulation binaries found, building now.") + + # If needed, call the Builder to create the layer simulation binaries. + # This creates both the isolated and connected binaries in one go. + if needs_rebuild: + log.info("[BuildSimulation] Starting model preparation.") + self._prepare_model() + self.builder = SimulationBuilder(self.model, self.fpgapart, self.clk_ns) + sys.stdout = sys.stdout.console # type: ignore + self.binaries = self.builder.build_simulation( + with_live_display=False, + functional_sim=self.functional_sim, + ) + self.model.set_metadata_prop( + "simulation_binaries", "\n".join([str(p) for p in self.binaries.values()]) + ) + else: + # Run only compilation again, and avoid repeating building of the stitched IPs + def _compile(binary: Path) -> None: + result = subprocess.run( + "cmake .;make", + shell=True, + cwd=str(binary), + text=True, + capture_output=True, + ) + if result.returncode != 0: + raise FINNUserError(f"Failed compilation in {binary}: {result.stderr}") + + # Since we dont need a rebuild, sim_binaries contains the paths to the binaries + sim_binaries = [Path(p) for p in sim_binaries] + total = len(sim_binaries) + + # Prepare compiling the binaries again + done = 0 + + def _progress_callback(binary: str | Path) -> Callable: + nonlocal done, total + + def _f(future: Future) -> None: + nonlocal done, total + done += 1 + log.info( + f"[ [bold green]{int(100.0 * float(done) / float(total))}%[/bold green] ] " + f"Simulation [green italic]{binary}[/green italic] built.", + extra={"markup": True, "highlighter": None}, + ) + future.result() + + return _f + + # Run the compilation in parallel with the number of workers specified. + # If not specified, use 8 + compile_start = time.time() + futures: list[Future] = [] + with ThreadPoolExecutor(int(os.environ.get("NUM_DEFAULT_WORKERS", "8"))) as tpe: + for binary in sim_binaries: + futures.append(tpe.submit(_compile, binary)) + futures[-1].add_done_callback(_progress_callback(binary.name)) + tpe.shutdown() + compile_end = time.time() + log.info(f"Compilation done. Took {compile_end - compile_start} seconds") + return self.model, False + + def _prepare_model(self) -> None: + """Execute some preparation transformations on the model.""" + log.info("[BuildSimulation] Inserting DataWidthConverters...") + self.model = self.model.transform(InsertDWC()) + log.info("[BuildSimulation] Specializing layers...") + self.model = self.model.transform(SpecializeLayers(self.fpgapart)) + log.info("[BuildSimulation] Assigning unique and readable node and tensor names...") + self.model = self.model.transform(GiveUniqueNodeNames()) + self.model = self.model.transform(GiveReadableTensorNames()) + log.info("[BuildSimulation] Preparing IPs...") + self.model = self.model.transform(PrepareIP(self.fpgapart, self.clk_ns)) + log.info("[BuildSimulation] Synthesizing IPs...") + self.model = self.model.transform(HLSSynthIP()) + log.info("[BuildSimulation] Model preparation done.") diff --git a/src/finn/transformation/fpgadataflow/simulation_connected.py b/src/finn/transformation/fpgadataflow/simulation_connected.py new file mode 100644 index 0000000000..7aca8ac069 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/simulation_connected.py @@ -0,0 +1,1628 @@ +"""Node connected parallel simulations.""" + +import glob +import json +import math +import os +import pandas as pd +import time +import traceback +from concurrent.futures import Future, ThreadPoolExecutor +from copy import deepcopy +from enum import Enum +from pathlib import Path +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from rich.console import Console +from threading import Barrier +from typing import Any, cast + +from finn.builder.build_dataflow_config import DataflowBuildConfig +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.transformation.fpgadataflow.set_fifo_depths import get_fifo_split_configs +from finn.transformation.fpgadataflow.simulation import Simulation, SimulationType, store_fifo_data +from finn.transformation.fpgadataflow.simulation_controller import SimulationController +from finn.util.basic import make_build_dir +from finn.util.exception import FINNInternalError, FINNUserError +from finn.util.logging import log + + +# Hardware BRAM FIFOs lose entries to internal pipeline registers compared to the software FIFO +# model (which has exact capacity). This constant accounts for that overhead so that the +# minimization algorithm finds depths that are safe to deploy on hardware. +BRAM_FIFO_PIPELINE_OVERHEAD = 2 + + +def _count_bram_sub_fifos(depth: int, max_qsrl_depth: int) -> int: + """Return the number of BRAM (vivado) sub-FIFOs that *depth* decomposes into. + + Non-power-of-two BRAM FIFOs are decomposed into several power-of-two sub-FIFOs by + get_fifo_split_configs. Each sub-FIFO whose style is "vivado" has its own pipeline + register overhead, so the total overhead scales with the sub-FIFO count. + """ + return sum(1 for _, style in get_fifo_split_configs(depth, max_qsrl_depth) if style == "vivado") + + +def _safe_bram_starting_depth(peak_util: int, max_qsrl_depth: int) -> int: + """Return the smallest depth d such that d minus its BRAM pipeline overhead >= peak_util + 1. + + For LUTRAM depths (d <= max_qsrl_depth) the software model is exact so no overhead is needed. + For BRAM depths the overhead depends on how many sub-FIFOs the decomposition produces, + which itself depends on d. We iterate (typically 1-2 steps) until the overhead stabilises. + """ + d = max(peak_util + 1, 32) + if d <= max_qsrl_depth: + return d + # Iteratively find d where d - num_vivado(d)*overhead >= peak_util + 1 + overhead = 0 + while True: + d = peak_util + 1 + overhead + num_vivado = _count_bram_sub_fifos(d, max_qsrl_depth) + new_overhead = num_vivado * BRAM_FIFO_PIPELINE_OVERHEAD + if new_overhead <= overhead: + break + overhead = new_overhead + return max(d, 32) + + +class MinimizationOrder(Enum): + """The order in which the search algorithm minimizes the FIFO depths.""" + + NODE_ORDER = 0 + REVERSE_NODE_ORDER = 1 + LARGEST_BITWIDTH_DIFF_FIRST = 2 + SMALLEST_BITWIDTH_DIFF_FIRST = 3 + + # Non black-box model orders + AFTER_THRESHOLDS_FIRST = 4 + AFTER_DWC_FIRST = 5 + + # Half black-box + # If we ran a sim before, we know the largest FIFOs, so start with these. + # This strategy might work, if the changes to the model are small enough + REUSE_PREVIOUS_ORDER = 6 + + +class NodeConnectedSimulationController(SimulationController): + """Run simulations for node connected cases.""" + + def __init__( + self, + parallel_simulations: int, + names: list[str], + binaries: list[Path], + console: Console, + poll_interval: float = 1.0, + with_progressbar: bool = True, + ) -> None: + """Set up node connected simulation.""" + super().__init__( + parallel_simulations, names, binaries, console, poll_interval, with_progressbar + ) + # Synchronization barrier for configuration phase + self.sync_barrier: Barrier | None = None + for binary in binaries: + if not binary.exists(): + console.log(f"Binary {binary} does not exist!") + raise FINNUserError(f"Binary {binary} does not exist!") + + def _cleanup_shm_resources(self) -> None: + """Remove any existing shared memory segments and semaphores from /dev/shm.""" + try: + # Collect potential shared memory and semaphore names based on node names + shm_patterns = [] + # Pattern for shared memory segments (e.g., /nodename_0, /nodename_1) + shm_patterns.append("/dev/shm/*") + + removed_count = 0 + for pattern in shm_patterns: + for filepath in glob.glob(pattern): + try: + Path(filepath).unlink() + removed_count += 1 + except (FileNotFoundError, PermissionError): # noqa: PERF203 + # File might already be removed or we don't have permission + pass + + if removed_count > 0: + log.info(f"Cleaned up {removed_count} existing shared memory resources") + except Exception as e: + # Don't fail if cleanup fails - just log it + self.console.log(f"Warning: Error during shared memory cleanup: {e}") + + def run( + self, + depth: list[list[int]] | None = None, + output_json: Path | None = None, + max_cycles: int | None = None, + fifo_first_valid_cycles: list[list[int]] | None = None, + ) -> dict[str, list[int]]: + """Run the simulation entirely with the given depth and sample count. + + Args: + depth: FIFO depth to configure for simulations. + samples: Number of samples to simulate. + output_json: Optional path to write merged simulation data as JSON. + max_cycles: Max cycles + fifo_first_valid_cycles: First valid cycle for each FIFO (used for timeout detection) + + Returns: + Dictionary mapping simulation names to their FIFO utilization arrays. + """ + futures: list[Future] = [] + fifo_results: dict[str, list[int]] = {} + cycles_results: dict[str, int] = {} + samples_results: dict[str, int] = {} + intervals_results: dict[str, list[int]] = {} + timeout_result = False + fifo_depths: dict[str, list[int]] = {} + fifo_cycles_until_first_valid_results: dict[str, list[int]] = {} + + # Clean up any existing shared memory resources before starting + self._cleanup_shm_resources() + + # Initialize barrier for all simulations to synchronize after configuration + self.sync_barrier = Barrier(len(self.names)) + + if self.progress is not None: + self.progress.start() + try: + with ThreadPoolExecutor(self.workers) as pool: + for i, (name, binary) in enumerate(zip(self.names, self.binaries, strict=True)): + is_last_node = i == len(self.names) - 1 + is_special_for_display = i == 0 or is_last_node + futures.append( + pool.submit( + self._run_binary, + binary, + name, + i % len(os.sched_getaffinity(0)) + if len(os.sched_getaffinity(0)) < len(self.names) + else -1, # sched_getaffinity needed, because + # cpu_count does not handle well with workload schedulers. + # We only pin the core if we have more simulations than cores to avoid + # simulations moving around too much and hurting performance. If we have + # more cores than simulations, we leave it to the OS to schedule. + depth[i] if depth is not None else None, + is_last_node, # Only last node has no output FIFOs + is_special_for_display, # First and last get special coloring + max_cycles, + fifo_first_valid_cycles[i] + if fifo_first_valid_cycles is not None + else None, + ) + ) + + # Wait for first completion or error + from concurrent.futures import FIRST_COMPLETED, wait + + all_futures = list(futures) # Keep track of all futures + while futures: + done, futures = wait(futures, return_when=FIRST_COMPLETED) + + # Check if any completed task indicates we should stop + for future in done: + try: + result = future.result() # This will raise if there was an exception + if result is not None: + ( + sim_name, + fifo_util, + cycles, + samps, + intervals, + timeout, + fifo_depth, + fifo_cycles_until_first_valid, + ) = result + fifo_depths[sim_name] = fifo_depth + fifo_results[sim_name] = fifo_util + cycles_results[sim_name] = cycles + samples_results[sim_name] = samps + intervals_results[sim_name] = intervals + fifo_cycles_until_first_valid_results[sim_name] = ( + fifo_cycles_until_first_valid + ) + timeout_result = timeout_result or timeout + except Exception as e: # noqa + self.console.log(f"Simulation failed: {e}") + # Set stop flag and break + with self.stop_lock: + self.should_stop = True + break + + # If we should stop, signal all remaining simulations + with self.stop_lock: + if self.should_stop: + # Don't cancel - let them finish with early stop + break + + # Wait for all futures to complete and collect their results + pool.shutdown(wait=True) + for future in all_futures: + if not future.done(): + continue + try: + result = future.result() + if result is not None: + ( + sim_name, + fifo_util, + cycles, + samps, + intervals, + timeout, + fifo_depth, + fifo_cycles_until_first_valid, + ) = result + # Only update if not already collected + if sim_name not in fifo_results: + fifo_cycles_until_first_valid_results[sim_name] = ( + fifo_cycles_until_first_valid + ) + fifo_depths[sim_name] = fifo_depth + fifo_results[sim_name] = fifo_util + cycles_results[sim_name] = cycles + samples_results[sim_name] = samps + intervals_results[sim_name] = intervals + timeout_result = timeout_result or timeout + except Exception as e: + self.console.log(f"Error collecting result: {e}") + + # Detect nodes whose _run_binary returned None (subprocess + # crash / unhandled exception). Their names were never inserted into + # fifo_results, so the merged JSON would contain empty 'intervals' lists + # for those nodes. _check_performance would then silently return False + # (no degradation detected) and the minimisation algorithm would treat a + # failed simulation as a successful one. Mark the run as timed-out so + # that _test_depth correctly rejects the candidate depth. + missing_nodes = [name for name in self.names if name not in fifo_results] + if missing_nodes: + self.console.log( + f"[bold red]WARNING: simulation results missing for node(s) " + f"{missing_nodes} (subprocess likely crashed). " + f"Marking run as timed-out to prevent false-success " + f"classification.[/bold red]" + ) + timeout_result = True + finally: + if self.progress is not None: + self.progress.stop() + self._cleanup_sockets() + + # Merge all simulation data + if output_json is not None: + merged_data = { + "simulations": [ + { + "name": name, + "fifo_utilization": fifo_results.get(name, []), + "fifo_depth": fifo_depths.get(name, []), + "cycles": cycles_results.get(name, 0), + "samples": samples_results.get(name, 0), + "intervals": intervals_results.get(name, []), + "fifo_cycles_until_first_valid": fifo_cycles_until_first_valid_results.get( + name, [] + ), + } + for name in self.names + ], + "depth_configured": depth, + "timeout_occurred": timeout_result, + } + output_json.write_text(json.dumps(merged_data, indent=2)) + + return fifo_results + + def _run_binary( + self, + binary: Path, + name: str | None, + _cpu: int | None, + depth: list[int] | None = None, + is_last_node: bool = False, + is_special_for_display: bool = False, + max_cycles: int | None = None, + fifo_first_valid_cycles: list[int] | None = None, + ) -> tuple[str, list[int], int, int, list[int], bool, list[int], list[int]] | None: + """Run the specified simulation binary in a new subprocess and communicate with it. + + Args: + binary: Path to simulation binary + name: Name of simulation node + _cpu: CPU affinity (unused) + depth: List of FIFO depths for this node's output FIFOs + is_last_node: True if this is the last node (no output FIFOs to configure) + is_special_for_display: True if this node should get special color in logs + max_cycles: Maximum cycles to simulate + fifo_first_valid_cycles: First valid cycle for each FIFO (used for timeout detection) + + Returns: + Tuple of (simulation_name, fifo_utilization, cycles, samples, intervals, timeout, + fifo_depth, fifo_cycles_until_first_valid) on success, + None on failure. + """ + cwd = binary.parent + if name is None: + name = cwd.name.replace("rtlsim_", "") + + process_index = self.names.index(name) + + with (self.logdir / f"{name}_{process_index}_of_{self.total}.txt").open("w+") as logfile: + + def _print(msg: str, color: str = "green") -> None: + if self.progress is None: + if is_special_for_display: + color = "orange3" + if "ERROR" in msg: + color = "red" + log.debug( + f"[bold {color}]{name:<35}" + f"[/bold {color}][cornflower_blue]{process_index} " + f"/ {len(self.names) - 1}[/cornflower_blue] {msg:<35}" + ) + logfile.write(f"{msg}\n") + logfile.flush() + + try: + # Start the simulation process with socket communication + proc_idx = self._start_process( + binary, process_index, cpu=_cpu if _cpu is not None else -1 + ) + + # Send configuration commands + # Last node has no output FIFOs, so don't configure FIFO depths + config_payload: dict[str, list[int] | int] = {} + if not is_last_node and depth is not None: + config_payload["fifo_depth"] = depth + if max_cycles is not None: + config_payload["max_cycles"] = max_cycles + if not is_last_node and fifo_first_valid_cycles is not None: + config_payload["fifo_first_valid_cycles"] = fifo_first_valid_cycles + + response = self._send_and_receive(proc_idx, "configure", config_payload) + + if not response or response.get("status") != "success": + error_msg = ( + response.get("message", "Unknown error") if response else "No response" + ) + _print(f"Configuration failed: {error_msg}", "red") + return None + + # Wait for all simulations to complete configuration before starting + _print("Waiting for all simulations to complete configuration...") + if self.sync_barrier is not None: + self.sync_barrier.wait() + _print("All simulations configured, starting...") + + # Start the simulation + response = self._send_and_receive(proc_idx, "start", {}) + + if not response or response.get("status") != "success": + error_msg = ( + response.get("message", "Unknown error") if response else "No response" + ) + _print(f"Failed to start simulation: {error_msg}", "red") + return None + + cycles = 0 + samps = 0 + intervals: list[int] = [] + timeout = False + fifo_util: list[int] = [] + fifo_depth: list[int] = [] + fifo_cycles_until_first_valid: list[int] = [] + + # Poll for status updates + while True: + # Check if we should stop early + with self.stop_lock: + if self.should_stop: + try: + stop_response = self._send_and_receive(proc_idx, "stop", {}) + except (BrokenPipeError, ConnectionResetError, RuntimeError): + # Process may have already exited - that's ok during shutdown + stop_response = None + if stop_response: + cycles = stop_response.get("cycles", 0) + samps = stop_response.get("samples", 0) + fifo_util = stop_response.get("fifo_utilization", []) + intervals = stop_response.get("intervals", []) + fifo_depth = stop_response.get("fifo_depth", []) + timeout = stop_response.get("timeout", False) + fifo_cycles_until_first_valid = stop_response.get( + "fifo_cycles_until_first_valid", [] + ) + if fifo_util: + logfile.write(f"Final FIFO utilization: {fifo_util}\n") + return ( + name, + fifo_util, + cycles, + samps, + intervals, + timeout, + fifo_depth, + fifo_cycles_until_first_valid, + ) + time.sleep(self.poll_interval) + + response = self._send_and_receive(proc_idx, "status", {}) + + if not response: + _print("Lost connection to simulation", "red") + with self.stop_lock: + self.should_stop = True + raise RuntimeError("Lost connection to simulation") + + state = response.get("state", "unknown") + + if state == "finished" or state == "timeout": + cycles = response.get("cycles", 0) + samps = response.get("samples", 0) + fifo_util = response.get("fifo_utilization", []) + fifo_depth = response.get("fifo_depth", []) + intervals = response.get("intervals", []) + timeout = response.get("timeout", False) + fifo_cycles_until_first_valid = response.get( + "fifo_cycles_until_first_valid", [] + ) + with self.stop_lock: + self.should_stop = True + break + + if state == "running": + # Update progress if available + cycles = response.get("cycles", 0) + + if state == "error": + error_msg = response.get("message", "Unknown error") + _print(f"Simulation error: {error_msg}", "red") + # Signal other simulations to stop + with self.stop_lock: + self.should_stop = True + raise RuntimeError(f"Simulation error: {error_msg}") + + # Stop the simulation + stop_response = self._send_and_receive(proc_idx, "stop", {}) + + if stop_response: + fifo_util = stop_response.get("fifo_utilization", []) + fifo_depth = stop_response.get("fifo_depth", []) + cycles = stop_response.get("cycles", 0) + samps = stop_response.get("samples", 0) + fifo_cycles_until_first_valid = stop_response.get( + "fifo_cycles_until_first_valid", [] + ) + if fifo_util: + logfile.write(f"Final FIFO utilization: {fifo_util}\n") + + return ( + name, + fifo_util, + cycles, + samps, + intervals, + timeout, + fifo_depth, + fifo_cycles_until_first_valid, + ) + + except Exception as e: + self.console.log(f"Exception caught during simulation execution ({name}): {e}") + self.console.log(traceback.format_exc()) + logfile.write(f"Exception: {e}\n") + logfile.write(traceback.format_exc()) + with self.stop_lock: + self.should_stop = True + return None + + +class NodeConnectedSimulation(Simulation): + def __init__( + self, + model: ModelWrapper, + simulation_type: SimulationType, + fpgapart: str, + clk_ns: float, + functional_sim: bool, + workers: int | None = None, + max_qsrl_depth: int = 256, + ) -> None: + super().__init__(model, simulation_type, fpgapart, clk_ns, functional_sim, workers) + self.max_qsrl_depth = max_qsrl_depth + + def simulate( + self, + depth: int | list[list[int]] | None = None, + max_cycles: int | None = None, + fifo_first_valid_cycles: list[list[int]] | None = None, + ) -> tuple[list[dict[str, list[int]]], bool]: + """Simulate the given number of samples for every layer. Layers are completely isolated + and simulated in parallel. + Simulation data is returned as a list of dicts (by node name as index). + """ + if self.simulation_type != SimulationType.NODE_BASED_CONNECTED: + raise FINNInternalError( + f"Called simulation function 'simulate_node_connected' " + f"does not match provided simulation type " + f"{self.simulation_type}" + ) + names = [node.name for node in self.model.graph.node] + initial_depth: Any = [[depth]] * len(self.binaries) if isinstance(depth, int) else depth + + # For BRAM FIFOs (depth > max_qsrl_depth), hardware loses BRAM_FIFO_PIPELINE_OVERHEAD + # entries to internal pipeline registers *per BRAM sub-FIFO*. Non-power-of-two depths + # are decomposed into several power-of-two sub-FIFOs (see get_fifo_split_configs), so + # the total overhead is num_bram_sub_fifos * BRAM_FIFO_PIPELINE_OVERHEAD. + # Rounding to a full BRAM block before calling get_fifo_split_configs is NOT needed: + # the decomposition works on any depth, and we want the sub-FIFO count for the exact + # depth under test. + if initial_depth is not None and not isinstance(initial_depth, int): + adjusted_depth: Any = [ + [ + d - _count_bram_sub_fifos(d, self.max_qsrl_depth) * BRAM_FIFO_PIPELINE_OVERHEAD + if d > self.max_qsrl_depth + else d + for d in node_depths + ] + for node_depths in initial_depth + ] + else: + adjusted_depth = initial_depth + + # Run simulation + start = time.time() + output_json = Path(make_build_dir("simulation_results_")) / "simulation_data.json" + controller = NodeConnectedSimulationController( + len(self.binaries), names, list(self.binaries.values()), Console(), 0.1, False + ) + controller.run(adjusted_depth, output_json, max_cycles, fifo_first_valid_cycles) + end = time.time() + log.debug(f"Simulation took {end - start} seconds!") + + # Load the merged data from JSON + merged_data = json.loads(output_json.read_text()) + + # Return the collected data indexed by node index + data = [] + for sim_entry in merged_data["simulations"]: + data.append( + { + "name": sim_entry["name"], + "fifo_utilization": sim_entry["fifo_utilization"], + "fifo_depth": sim_entry["fifo_depth"], + "cycles": sim_entry["cycles"], + "samples": sim_entry["samples"], + "intervals": sim_entry["intervals"], + "fifo_cycles_until_first_valid": sim_entry["fifo_cycles_until_first_valid"], + } + ) + json.dump(data, output_json.open("w"), indent=4) + return data, merged_data.get("timeout_occurred", False) + + +class RunLayerParallelSimulation(Transformation): # noqa + def __init__( + self, + fpgapart: str, + clk_ns: float, + cfg: DataflowBuildConfig, + minimization_orders: list[MinimizationOrder] | None = None, + max_qsrl_depth: int = 256, + vivado_ram_style: str = "auto", + quality_of_results: str = "default", + ) -> None: + """Run layer parallel simulations.""" + super().__init__() + self.fpgapart = fpgapart + self.clk_ns = clk_ns + self.cfg = cfg + self.max_qsrl_depth = max_qsrl_depth + self.vivado_ram_style = vivado_ram_style + self.quality_of_results = quality_of_results + if minimization_orders is not None: + self.minimization_orders = minimization_orders + else: + # TODO: Set to ALL search orders + self.minimization_orders = [MinimizationOrder.NODE_ORDER] + + self.final_depths: dict[MinimizationOrder, list[list[int]] | None] = dict.fromkeys( + self.minimization_orders + ) + + def create_starting_fifo_depths( + self, initial_fifo_depths: list[dict[str, list[int]]] + ) -> tuple[list[list[int]], list[list[int]]]: + """From the given initial_fifo_depths returned by the simulation, create a starting + FIFO depth configuration that can be modified sequentially by the minimization algorithm. + Also return the fifo_first_valid_cycles. + """ + # Create fifo_depths (indexed by layer index and then stream index) + fifo_depths: list[list[int]] = [] # Each entry is a list of fifo sizes for that node + for val in initial_fifo_depths: + # Use _safe_bram_starting_depth so that simulate() (which subtracts + # num_sub_fifos*BRAM_FIFO_PIPELINE_OVERHEAD for BRAM depths) still sees a depth + # that covers the observed peak utilisation. A flat +2 is insufficient when a + # depth decomposes into multiple BRAM sub-FIFOs (e.g. depth 1537 → 2 sub-FIFOs + # → 4 entries of overhead). + fifo_depths.append( + [_safe_bram_starting_depth(v, self.max_qsrl_depth) for v in val["fifo_utilization"]] + ) + fifo_first_valid_cycles: list[list[int]] = [] + for val in initial_fifo_depths: + fifo_first_valid_cycles.append( + [v + math.ceil(v * 0.01) for v in val["fifo_cycles_until_first_valid"]] + ) # Add 1% cycles grace period + return fifo_depths, fifo_first_valid_cycles + + def get_minimization_order_indices( + self, + min_order: MinimizationOrder, + model: ModelWrapper, + bitwidths: list[int], + ) -> list[int]: + """Given a MinimizationOrder, return the list of indices to + access/minimize `fifo_depths` for that order. For example, NODE_ORDER would return + [0,1,2,...] and NODE_ORDER_REVERSED [N, N-1, N-2, ..., 0]. + """ + assert len(model.graph.node) == len(bitwidths) + match min_order: + case MinimizationOrder.NODE_ORDER: + return list(range(len(model.graph.node))) + case MinimizationOrder.REVERSE_NODE_ORDER: + return list(range(len(model.graph.node)))[::-1] + case ( + MinimizationOrder.LARGEST_BITWIDTH_DIFF_FIRST + | MinimizationOrder.SMALLEST_BITWIDTH_DIFF_FIRST + ): + diffs: list[tuple[int, int]] = [] # (index, diff) + for i in range(len(model.graph.node)): + hw: HWCustomOp = getCustomOp(model.graph.node[i]) + in_width = max( + [hw.get_instream_width(j) for j in range(len(model.graph.node[i].input))] + ) + out_width = max( + [hw.get_outstream_width(j) for j in range(len(model.graph.node[i].output))] + ) + diffs.append((i, in_width - out_width)) + sorted_order = sorted( + diffs, + key=lambda x: x[1], + reverse=(min_order == MinimizationOrder.LARGEST_BITWIDTH_DIFF_FIRST), + ) + return [idx for idx, diff in sorted_order] + case _: + raise NotImplementedError() + + def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, bool]: + """Run layer parallel simulations.""" + sim = NodeConnectedSimulation( + model, + SimulationType.NODE_BASED_CONNECTED, + self.fpgapart, + self.clk_ns, + self.cfg.functional_simulation, + max_qsrl_depth=self.max_qsrl_depth, + ) + model = sim.model # TODO:clean up + + # Create empty table for datapoints that will be collected + # First create as a nested dict, since not all data is avilable at the same time + # It is then flattened when creating the dataframe, so that node and stream are columns too + # df_data[node][stream_idx][columnm] = ... + df_data: dict[str, list[dict[str, Any]]] = {} + for nodeindex, node in enumerate(model.graph.node): + df_data[node.name] = [] + for node_idx in range(len(node.output)): + df_data[node.name].append( + { + "onnx_index": nodeindex, + "out_bitwidth": -1, + "out_initial_fifo_depths": -1, + "fifo_cycles_until_first_valid": -1, + "successor_node": ", ".join( + [node.name for node in model.find_consumers(node.output[node_idx])] + ), + } + ) + for min_order in self.minimization_orders: + df_data[node.name][-1][f"out_final_depth_{min_order.name}"] = -1 + df_data[node.name][-1][f"simulation_time_{min_order.name}"] = -1 + df_data[node.name][-1][f"minimization_iterations_{min_order.name}"] = -1 + + # TODO: The final depths contained a lot of -1 (default values). + # Did we need to write the initial depths into there? + # Or in case of minimization skip we likely need to write the values still. + + # Running the initial simulation + log.info("Running initial node-connected simulation.") + initial_fifo_depths, _ = sim.simulate() + + # Store the initial sizes as a report + initial_sizes_path = ( + Path(self.cfg.output_dir) / "report" / "initial_fifo_sizes_sim_connected.json" + ) + initial_sizes_path.write_text(json.dumps(initial_fifo_depths, indent=4)) + log.info(f"Wrote initial sizes to: {initial_sizes_path}") + + # Store initial sizes in dataframe as well + for layerdata in initial_fifo_depths: + for idx in range(len(layerdata["fifo_utilization"])): + name: str = cast("str", layerdata["name"]) + df_data[name][idx]["out_initial_fifo_depths"] = layerdata["fifo_utilization"][idx] + df_data[name][idx]["fifo_cycles_until_first_valid"] = layerdata[ + "fifo_cycles_until_first_valid" + ][idx] + + # List of list of fifo depths + fifo_depths, fifo_first_valid_cycles = self.create_starting_fifo_depths(initial_fifo_depths) + + # Max cycles for any simulation + sim_cycles: int = cast("int", max([val["cycles"] for val in initial_fifo_depths])) + + # Extract bitwidths from outstream widths of hw nodes + bit_widths = [] + for node_idx in range(len(fifo_depths)): + bit_widths.append([]) + hw_node = getCustomOp(model.graph.node[node_idx]) + if isinstance(hw_node, HWCustomOp): + for fifo_idx in range(len(fifo_depths[node_idx])): + bit_widths[node_idx].append(hw_node.get_outstream_width(fifo_idx)) + else: + raise FINNInternalError("Non-HW node found in dataflow graph during simulation") + + # Store bitwidths into dataframe as well + for node_idx in range(len(bit_widths)): + for fifo_idx in range(len(bit_widths[node_idx])): + df_data[model.graph.node[node_idx].name][fifo_idx]["out_bitwidth"] = bit_widths[ + node_idx + ][fifo_idx] + + # Run minimization for every layer/stream + log.info("Minimizing layers...") + needs_minimization = [] + for node_idx in range(len(fifo_depths)): + needs_minimization.append([True] * len(fifo_depths[node_idx])) + for node_idx in range(len(fifo_depths)): + for fifo_idx in range(len(fifo_depths[node_idx])): + # Check if we can reduce the fifo size + + used_size = fifo_depths[node_idx][fifo_idx] + bw = bit_widths[node_idx][fifo_idx] + + needs_minimization[node_idx][fifo_idx] = self._needs_minimization(used_size, bw) + + # Total minimizations + total_minimizations = sum(len(streams) for streams in fifo_depths) + + for k, minimization_order in enumerate(self.minimization_orders): + # Create a new empty FIFO depth list + fifo_depths, fifo_first_valid_cycles = self.create_starting_fifo_depths( + initial_fifo_depths + ) + + # Minimize FIFO depths using binary search over BRAM block counts + idx_order = self.get_minimization_order_indices(minimization_order, model, bit_widths) + if len(idx_order) != len(model.graph.node): + raise FINNInternalError( + f"Expected index order length {len(model.graph.node)}, but got {len(idx_order)}" + ) + + log.info( + f"Minimizing using order: {minimization_order.name}. Index order is: {idx_order}" + ) + + done = 0 + for node_idx in idx_order: + for fifo_idx in range(len(fifo_depths[node_idx])): + if not needs_minimization[node_idx][fifo_idx]: + df_data[model.graph.node[node_idx].name][fifo_idx][ + f"simulation_time_{minimization_order.name}" + ] = 0.0 + df_data[model.graph.node[node_idx].name][fifo_idx][ + f"out_final_depth_{minimization_order.name}" + ] = fifo_depths[node_idx][fifo_idx] + df_data[model.graph.node[node_idx].name][fifo_idx][ + f"minimization_iterations_{minimization_order.name}" + ] = 0 + log.info( + f"[ {node_idx}.{fifo_idx} / {len(fifo_depths) - 1} ] " + f"Skipping minimization for this stream." + ) + done += 1 + continue + + minimization_start = time.time() + minimized_depth, iterations_needed = self._minimize_fifo_depth( + node_idx, + fifo_idx, + fifo_depths, # current_depths: evolves as FIFOs are minimised + bit_widths, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + ) + minimization_time = time.time() - minimization_start + + # Store the minimized size + fifo_depths[node_idx][fifo_idx] = minimized_depth + done += 1 + + # Store data into dataframe + df_data[model.graph.node[node_idx].name][fifo_idx][ + f"simulation_time_{minimization_order.name}" + ] = minimization_time + df_data[model.graph.node[node_idx].name][fifo_idx][ + f"minimization_iterations_{minimization_order.name}" + ] = iterations_needed + df_data[model.graph.node[node_idx].name][fifo_idx][ + f"out_final_depth_{minimization_order.name}" + ] = fifo_depths[node_idx][fifo_idx] + log.debug( + f"Set node/stream {node_idx}.{fifo_idx} to " + f"depth {fifo_depths[node_idx][fifo_idx]}, in " + f"{iterations_needed} iterations and {minimization_time} " + f"seconds. (To {minimization_order.name})" + ) + + percentage = int(100.0 * float(done) / float(total_minimizations)) + log.info( + f"[ [bold green]{percentage}%[/bold green] ] " + f"[ {node_idx}.{fifo_idx} / {len(fifo_depths) - 1} ] Simulation completed " + f"({iterations_needed} iterations).", + extra={"markup": True, "highlighter": None}, + ) + + self.final_depths[minimization_order] = deepcopy(fifo_depths) + + order_percent = int(100.0 * float(k + 1) / float(len(self.minimization_orders))) + log.info( + f"[ [bold gold1]{order_percent}%[/bold gold1] ] " + f"----- Minimization order {minimization_order.name} completed -----", + extra={"markup": True, "highlighter": None}, + ) + + # Store dataframe + df_keys = list(df_data[model.graph.node[0].name][0].keys()) + log.debug(f"Saving keys: {df_keys} + [node, stream]") + df_dict = {} + df_dict["node"] = [] + df_dict["stream"] = [] + for k in df_keys: + df_dict[k] = [] + for node, nodedata in df_data.items(): + for streamindex, streamdata in enumerate(nodedata): + df_dict["node"].append(node) + df_dict["stream"].append(streamindex) + for key in streamdata.keys(): + df_dict[key].append(streamdata[key]) + + df = pd.DataFrame(df_dict) + model = store_fifo_data( + model, + df, + Path(self.cfg.output_dir) / "report" / "fifo_data.csv", + delete_existing=False, + store_html=True, + ) + + # Use the smallest fifo depths found (by total bytes) + smallest_order = self.minimization_orders[0] + smallest_size = None + for order in self.minimization_orders: + current_size = 0 + depths = self.final_depths[order] + if depths is None: + raise FINNInternalError( + f"Expected FIFO sizes for minimization order {order.name}, but found None." + ) + for node_idx in range(len(depths)): + for fifo_idx in range(len(depths[node_idx])): + current_size += depths[node_idx][fifo_idx] * bit_widths[node_idx][fifo_idx] + + if smallest_size is None or current_size < smallest_size: + smallest_size = current_size + smallest_order = order + + # Set the result fifo depths + fifo_depths = self.final_depths[smallest_order] + assert fifo_depths is not None + + # Make sure that all FIFOs with depth > 256 use a full BRAM block, + # since partial blocks are not supported by Vivado HLS + for node_idx in range(len(fifo_depths)): + for fifo_idx in range(len(fifo_depths[node_idx])): + if fifo_depths[node_idx][fifo_idx] > self.max_qsrl_depth: + bw = bit_widths[node_idx][fifo_idx] + blocks = calculate_bram_blocks(fifo_depths[node_idx][fifo_idx], bw) + # if len(fifo_depths[i]) > 1: + # blocks_plus_one = self._get_valid_block_counts( + # blocks + 1, blocks + 1000, bw + # ) + # _, max_d = calculate_bram_depth_range(blocks_plus_one[0], bw) + # else: + _, max_d = calculate_bram_depth_range(blocks, bw) + fifo_depths[node_idx][fifo_idx] = max_d + + log.info("Final FIFO depths:") + for node_idx in range(len(fifo_depths)): + log.info(f"{node_idx}: {fifo_depths[node_idx]}") + + log.info("Running final end-to-end validation simulation with minimised FIFO depths...") + validation_data, validation_timeout = sim.simulate( + fifo_depths, + max_cycles=math.ceil(sim_cycles * 1.05), + fifo_first_valid_cycles=fifo_first_valid_cycles, + ) + if validation_timeout: + raise FINNUserError( + "Final validation simulation timed out with the jointly-minimised FIFO depths. " + "The per-FIFO minimisation may have produced a configuration that is " + "collectively too small. Re-run with a larger initial depth or fewer " + "minimisation orders." + ) + if self._check_performance(validation_data, initial_fifo_depths): + raise FINNUserError( + "Final validation simulation detected throughput degradation with the " + "jointly-minimised FIFO depths (intervals exceeded baseline). " + "The per-FIFO minimisation may have produced a configuration that is " + "collectively too small. Re-run with a larger initial depth or fewer " + "minimisation orders." + ) + log.info("Final validation simulation passed - minimised depths are correct.") + + # Write back results. By default write to output_dir / "fifo_config.json" + writeback_path = Path(self.cfg.output_dir) / "fifo_config.json" + assert len(fifo_depths) == len(model.graph.node) + json_results = [] + for node_idx, node in enumerate(model.graph.node): + json_results.append({"node": node.name, "depths": fifo_depths[node_idx]}) + with writeback_path.open("w") as f: + json.dump(json_results, f) + log.info(f"Wrote results back to {writeback_path}") + + return model, False + + def _check_performance( + self, new_data: list[dict[str, list[int]]], initial_fifo_depths: list[dict[str, list[int]]] + ) -> bool: + """Check if performance has degraded compared to baseline. + + Args: + new_data: Simulation results to check + initial_fifo_depths: Baseline performance data + + Returns: + True if performance degraded, False otherwise + """ + for new, initial in zip(new_data, initial_fifo_depths, strict=True): + if len(new["intervals"]) != len(initial["intervals"]): + raise FINNInternalError( + "New simulation data has different number of streams than baseline." + ) + for idx in range(len(new["intervals"])): + if new["intervals"][idx] > initial["intervals"][idx]: + return True + return False + + def _test_depth( + self, + test_depth: int, + node_idx: int, + fifo_idx: int, + current_depths: list[list[int]], + initial_fifo_depths: list[dict[str, list[int]]], + sim: NodeConnectedSimulation, + sim_cycles: float, + fifo_first_valid_cycles: list[list[int]], + ) -> tuple[bool, bool]: + """Test a specific FIFO depth. + + Args: + test_depth: Depth to test + node_idx: Node index + fifo_idx: FIFO index within node + current_depths: Current working FIFO depth configuration. FIFOs that have + already been minimised contain their final minimised depth; FIFOs not yet + processed still carry the safe starting depth. This list is never + modified by this method - a deep copy is made before inserting + ``test_depth``. + initial_fifo_depths: Baseline performance data + sim: Simulation controller + sim_cycles: Maximum simulation cycles + fifo_first_valid_cycles: First valid cycle for each FIFO + Returns: + Tuple of (success, timeout) where success means depth works without degradation + """ + test_depths = deepcopy(current_depths) + test_depths[node_idx][fifo_idx] = test_depth + + new_simulation_data, timeout = sim.simulate( + test_depths, + max_cycles=min( + math.ceil(sim_cycles * 1.05), math.ceil(sim_cycles) + 10 * len(test_depths) + ), + fifo_first_valid_cycles=fifo_first_valid_cycles, + ) + + if timeout: + return False, True + + performance_degraded = self._check_performance(new_simulation_data, initial_fifo_depths) + return not performance_degraded, False + + def _get_valid_block_counts(self, min_blocks: int, max_blocks: int, bitwidth: int) -> list[int]: + """Get all valid BRAM block counts in the specified range. + + Some block counts are invalid for certain bitwidths due to quantization. + This method returns only the valid configurations. + + Args: + min_blocks: Minimum block count (inclusive) + max_blocks: Maximum block count (inclusive) + bitwidth: Data bitwidth + + Returns: + Sorted list of valid block counts + """ + valid_blocks = [] + for blocks in range(min_blocks, max_blocks + 1): + _, max_d = calculate_bram_depth_range(blocks, bitwidth) + if max_d > 0: # Valid configuration + valid_blocks.append(blocks) + return valid_blocks + + def _minimize_fifo_depth( + self, + node_idx: int, + fifo_idx: int, + current_depths: list[list[int]], + bit_widths: list[list[int]], + initial_fifo_depths: list[dict[str, list[int]]], + sim: NodeConnectedSimulation, + sim_cycles: int, + fifo_first_valid_cycles: list[list[int]], + ) -> tuple[int, int]: + """Minimize a single FIFO depth using binary search. + + Args: + node_idx: Node index + fifo_idx: FIFO index within node + current_depths: Current working FIFO depth configuration. FIFOs that have + already been minimised in this pass carry their final minimised depth; + FIFOs not yet processed still carry the safe starting depth. This list + is mutated by the caller (``apply``) after each call to store the + minimised result, so successive calls see the evolving state. + bit_widths: Bitwidths for all FIFOs + initial_fifo_depths: Baseline performance data + sim: Simulation controller + sim_cycles: Maximum simulation cycles + fifo_first_valid_cycles: First valid cycle for each FIFO + Returns: + Tuple: Minimized FIFO depth, Iterations required to arrive at the result + """ + iterations = 0 + original_size = current_depths[node_idx][fifo_idx] + bw = bit_widths[node_idx][fifo_idx] + + log.debug(f"Minimizing Node {node_idx} FIFO {fifo_idx}: original depth {original_size}") + + # If FIFO depth of 32 works, use it because it fits into bw/2 LUTs + success, timeout = self._test_depth( + 32, + node_idx, + fifo_idx, + current_depths, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + ) + iterations += 1 + if success: + return 32, iterations + + if original_size <= self.max_qsrl_depth: + upper_luts = calculate_srl16e_luts(original_size, bw) + # LUTRAM based FIFOs have block sizes of 32, so smallest after 32 is 64 + lower_luts = calculate_srl16e_luts(64, bw) + + # Binary search if there's room to search + if upper_luts > lower_luts: + best_working_depth, bin_it = self._binary_search_srl_depth( + node_idx, + fifo_idx, + current_depths, + bw, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + lower_luts=lower_luts, + upper_luts=upper_luts, + ) + iterations += bin_it + return best_working_depth, iterations + return original_size, iterations + + # Try FIFO depth of 256 next (fits into LUTRAM) + success, timeout = self._test_depth( + self.max_qsrl_depth, + node_idx, + fifo_idx, + current_depths, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + ) + iterations += 1 + if success: + upper_luts = calculate_srl16e_luts(self.max_qsrl_depth, bw) + # LUTRAM based FIFOs have block sizes of 32, so smallest after 32 is 64 + lower_luts = calculate_srl16e_luts(64, bw) + + # Binary search if there's room to search + if upper_luts > lower_luts: + best_working_depth, bin_it = self._binary_search_srl_depth( + node_idx, + fifo_idx, + current_depths, + bw, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + lower_luts=lower_luts, + upper_luts=upper_luts, + ) + iterations += bin_it + return best_working_depth, iterations + return self.max_qsrl_depth, iterations + + # We know 256 doesn't work, so we have to use BRAMs + # Try one BRAM block less than current + upper_blocks = calculate_bram_blocks(original_size, bw) + # Get all valid block counts in the range + valid_blocks = self._get_valid_block_counts(1, upper_blocks - 1, bw) + if not valid_blocks: + # No valid configurations exist + return original_size, iterations + # Test the maximum valid block count first + # (largest depth below original, most likely to succeed) + max_valid_blocks = valid_blocks[-1] + _, max_d = calculate_bram_depth_range(max_valid_blocks, bw) + + success, timeout = self._test_depth( + max_d, + node_idx, + fifo_idx, + current_depths, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + ) + iterations += 1 + + if timeout or not success: + return original_size, iterations + + best_working_depth = max_d + + # Binary search if there's room to search and multiple valid configs + if len(valid_blocks) > 1: + best_working_depth, bin_it = self._exponential_binary_search_depth( + node_idx, + fifo_idx, + current_depths, + bw, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + valid_blocks=valid_blocks, + ) + iterations += bin_it + + return best_working_depth, iterations + + def _exponential_binary_search_depth( + self, + node_idx: int, + fifo_idx: int, + current_depths: list, + bitwidth: int, + initial_fifo_depths: list[dict[str, list[int]]], + sim: NodeConnectedSimulation, + sim_cycles: float, + fifo_first_valid_cycles: list[list[int]], + valid_blocks: list[int], + ) -> tuple[int, int]: + """Perform exponential + binary search over valid block configurations. + + Uses exponential search to quickly find the range, then binary search within it. + This is more efficient when smaller block counts are more likely. + Only searches over pre-validated block counts. + + Args: + node_idx: Node index + fifo_idx: FIFO index within node + current_depths: Current working FIFO depth configuration. FIFOs already + minimised in this pass carry their final depth; this list must not be + modified directly (``_test_depth`` deep-copies it before trial edits). + bitwidth: Data bitwidth + initial_fifo_depths: Baseline performance data + sim: Simulation controller + sim_cycles: Maximum simulation cycles + fifo_first_valid_cycles: First valid cycle for each FIFO + valid_blocks: Sorted list of valid block counts to search over + + Returns: + Tuple: Best working depth found, Number of iterations required to arrive at this result. + """ + iterations = 0 + if not valid_blocks: + raise FINNInternalError("valid_blocks list cannot be empty") + + # Start with the largest valid block count (known to work from caller) + _, max_d = calculate_bram_depth_range(valid_blocks[-1], bitwidth) + best_working_depth = max_d + + # Exponential search phase: find range where solution exists + # Check positions: 0, 1, 2, 4, 8, ... indices in valid_blocks list + lower_idx = 0 + upper_idx = len(valid_blocks) - 1 + exp_idx = 0 + last_failed_idx = -1 + + while exp_idx < upper_idx: + blocks = valid_blocks[exp_idx] + _, max_d = calculate_bram_depth_range(blocks, bitwidth) + + success, _ = self._test_depth( + max_d, + node_idx, + fifo_idx, + current_depths, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + ) + iterations += 1 + + if success: + # Found a working depth, now binary search in [last_failed_idx+1, exp_idx] + best_working_depth = max_d + lower_idx = last_failed_idx + 1 + upper_idx = exp_idx + break + # This doesn't work, try exponentially larger index + last_failed_idx = exp_idx + exp_idx = min(exp_idx * 2 if exp_idx > 0 else 1, upper_idx) + + # Binary search phase: refine the range + while lower_idx < upper_idx: + mid_idx = (lower_idx + upper_idx) // 2 + blocks = valid_blocks[mid_idx] + _, max_d = calculate_bram_depth_range(blocks, bitwidth) + + success, _ = self._test_depth( + max_d, + node_idx, + fifo_idx, + current_depths, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + ) + iterations += 1 + + if success: + # This depth works, try smaller (lower indices) + best_working_depth = max_d + upper_idx = mid_idx + else: + # This depth doesn't work, need larger (higher indices) + lower_idx = mid_idx + 1 + + return best_working_depth, iterations + + def _binary_search_srl_depth( + self, + node_idx: int, + fifo_idx: int, + current_depths: list, + bitwidth: int, + initial_fifo_depths: list[dict[str, list[int]]], + sim: NodeConnectedSimulation, + sim_cycles: float, + fifo_first_valid_cycles: list[list[int]], + lower_luts: int, + upper_luts: int, + ) -> tuple[int, int]: + """Perform binary search to find minimal working FIFO depth in LUTRAM range. + + Args: + node_idx: Node index + fifo_idx: FIFO index within node + current_depths: Current working FIFO depth configuration. FIFOs already + minimised in this pass carry their final depth; this list must not be + modified directly (``_test_depth`` deep-copies it before trial edits). + bitwidth: Data bitwidth + initial_fifo_depths: Baseline performance data + sim: Simulation controller + sim_cycles: Maximum simulation cycles + fifo_first_valid_cycles: First valid cycle for each FIFO + lower_luts: Lower bound for LUT count + upper_luts: Upper bound for LUT count (known to work) + + Returns: + Tuple: Best working depth found, Number of Iterations required to arrive at this result + """ + iterations = 0 + _, max_d = calculate_srl16e_depth_range(upper_luts, bitwidth) + best_working_depth = max_d + + while lower_luts < upper_luts: + mid_luts = (lower_luts + upper_luts) // 2 + + # Prevent infinite loop + if mid_luts == upper_luts: + mid_luts = upper_luts - 1 + if mid_luts < lower_luts: + break + + # Find valid depth for this LUT count + _, max_d = calculate_srl16e_depth_range(mid_luts, bitwidth) + + if max_d == 0: + # No valid configuration, try more LUTs + lower_luts = mid_luts + 1 + continue + + success, _ = self._test_depth( + max_d, + node_idx, + fifo_idx, + current_depths, + initial_fifo_depths, + sim, + sim_cycles, + fifo_first_valid_cycles, + ) + iterations += 1 + + if success: + # This depth works, try smaller + best_working_depth = max_d + upper_luts = mid_luts + else: + # This depth doesn't work, need larger + lower_luts = mid_luts + 1 + + return best_working_depth, iterations + + def _needs_minimization(self, fifo_depth: int, bitwidth: int) -> bool: + """Determine whether a FIFO can be minimized further. + + Args: + fifo_depth: Current FIFO depth + bitwidth: Data bitwidth + + Returns: + True if the FIFO can be minimized further, False otherwise. + """ + # Qsrl FIFO Formula: LUTs = ⌈depth/32⌉ x ⌈bitwidth/2⌉ + if fifo_depth <= 32: # FIFOs of depth <=32 fit into bitwidth/2 LUTs + return False + # Return False if exactly the minimum number of possible BRAM blocks is used for this + # bitwidth and depth is sufficiently large that further optimization is unlikely to succeed + return not ( + calculate_bram_blocks(fifo_depth, bitwidth) + <= self._get_valid_block_counts(1, bitwidth, bitwidth)[0] + and fifo_depth > math.floor(self.max_qsrl_depth * 1.1) + ) + + +def calculate_bram_blocks(depth: int, bitwidth: int) -> int: + """Calculate the number of BRAM blocks required for a BRAM FIFO. + + Args: + depth: FIFO depth + bitwidth: Data bitwidth + """ + if bitwidth == 1: + return math.ceil(depth / 16384) + if bitwidth == 2: + return math.ceil(depth / 8192) + if bitwidth <= 4: + return (math.ceil(depth / 4096)) * (math.ceil(bitwidth / 4)) + if bitwidth <= 9: + return (math.ceil(depth / 2048)) * (math.ceil(bitwidth / 9)) + if bitwidth <= 18 or depth > 512: + return (math.ceil(depth / 1024)) * (math.ceil(bitwidth / 18)) + return (math.ceil(depth / 512)) * (math.ceil(bitwidth / 36)) + + +def calculate_bram_depth_range(blocks: int, bitwidth: int) -> tuple[int, int]: + """Calculate the range of FIFO depths that use exactly the given number of BRAM blocks. + + Args: + blocks: Number of BRAM blocks + bitwidth: Data bitwidth + + Returns: + Tuple of (min_depth, max_depth) that uses exactly 'blocks' BRAM blocks. + """ + if blocks < 1: + raise FINNInternalError("Number of BRAM blocks must be at least 1") + + # Invert the formula from calculate_bram_blocks based on bitwidth + if bitwidth == 1: + # blocks = ⌈depth/16384⌉ + # Inversion: (blocks-1)*16384 < depth ≤ blocks*16384 + min_depth = (blocks - 1) * 16384 + 1 if blocks > 1 else 1 + max_depth = blocks * 16384 + elif bitwidth == 2: + # blocks = ⌈depth/8192⌉ + # Inversion: (blocks-1)*8192 < depth ≤ blocks*8192 + min_depth = (blocks - 1) * 8192 + 1 if blocks > 1 else 1 + max_depth = blocks * 8192 + elif bitwidth <= 4: + # blocks = ⌈depth/4096⌉ * ⌈bitwidth/4⌉ + bitwidth_factor = math.ceil(bitwidth / 4) + depth_blocks = math.ceil(blocks / bitwidth_factor) + min_depth = (depth_blocks - 1) * 4096 + 1 if depth_blocks > 1 else 1 + max_depth = depth_blocks * 4096 + elif bitwidth <= 9: + # blocks = ⌈depth/2048⌉ * ⌈bitwidth/9⌉ + bitwidth_factor = math.ceil(bitwidth / 9) + depth_blocks = math.ceil(blocks / bitwidth_factor) + min_depth = (depth_blocks - 1) * 2048 + 1 if depth_blocks > 1 else 1 + max_depth = depth_blocks * 2048 + elif bitwidth <= 18: + # blocks = ⌈depth/1024⌉ * ⌈bitwidth/18⌉ + bitwidth_factor = math.ceil(bitwidth / 18) + depth_blocks = math.ceil(blocks / bitwidth_factor) + min_depth = (depth_blocks - 1) * 1024 + 1 + max_depth = depth_blocks * 1024 + else: + # bitwidth > 18, split into two cases from original function + # Case 1: depth > 512 uses ⌈depth/1024⌉ * ⌈bitwidth/18⌉ + # Case 2: depth ≤ 512 uses ⌈depth/512⌉ * ⌈bitwidth/36⌉ + + # Try the depth > 512 case first (⌈depth/1024⌉ * ⌈bitwidth/18⌉) + bitwidth_factor = math.ceil(bitwidth / 18) + depth_blocks = math.ceil(blocks / bitwidth_factor) + + # Check if blocks is achievable with this bitwidth factor + if blocks % bitwidth_factor != 0 or depth_blocks < 1: + # Try the depth ≤ 512 case instead + pass + else: + min_depth = max((depth_blocks - 1) * 1024 + 1, 513) # Must be > 512 + max_depth = depth_blocks * 1024 + # Check if this range is valid (entirely > 512) + if min_depth > 512 and calculate_bram_blocks(min_depth, bitwidth) == blocks: + return (min_depth, max_depth) + + # Try the depth ≤ 512 case (⌈depth/512⌉ * ⌈bitwidth/36⌉) + bitwidth_factor = math.ceil(bitwidth / 36) + depth_blocks = math.ceil(blocks / bitwidth_factor) + + # Check if blocks is achievable with this bitwidth factor + if blocks % bitwidth_factor != 0 or depth_blocks < 1: + return (0, 0) # Invalid block count for this bitwidth + + min_depth = (depth_blocks - 1) * 512 + 1 if depth_blocks > 1 else 1 + max_depth = min(depth_blocks * 512, 512) # Must be ≤ 512 + + # Verify the range is valid (entirely ≤ 512 and produces correct block count) + if max_depth <= 512 and calculate_bram_blocks(min_depth, bitwidth) == blocks: + return (min_depth, max_depth) + + return (0, 0) # No valid range found + + # Verify the range is valid + if calculate_bram_blocks(min_depth, bitwidth) != blocks: + raise FINNInternalError("Calculated BRAM depth range is invalid!") + return (min_depth, max_depth) + + +def calculate_uram_blocks(depth: int, bitwidth: int) -> int: + """Calculate the number of URAM blocks required for a URAM FIFO. + + Args: + depth: FIFO depth + bitwidth: Data bitwidth + """ + return (math.ceil(depth / 4096)) * (math.ceil(bitwidth / 72)) + + +def calculate_uram_depth_range(blocks: int, bitwidth: int) -> tuple[int, int]: + """Calculate the range of FIFO depths that use exactly the given number of URAM blocks. + + Args: + blocks: Number of URAM blocks + bitwidth: Data bitwidth + + Returns: + Tuple of (min_depth, max_depth) that uses exactly 'blocks' URAM blocks. + Returns (0, 0) if no valid range exists. + """ + if blocks < 1: + return (0, 0) + + # URAM formula: blocks = ⌈depth/4096⌉ * ⌈bitwidth/72⌉ + bitwidth_factor = math.ceil(bitwidth / 72) + + # Calculate depth range + # Minimum depth: (blocks / bitwidth_factor - 1) * 4096 + 1 + # Maximum depth: (blocks / bitwidth_factor) * 4096 + + if blocks % bitwidth_factor != 0: + return (0, 0) # Invalid block count for this bitwidth + + depth_blocks = blocks // bitwidth_factor + min_depth = (depth_blocks - 1) * 4096 + 1 if depth_blocks > 1 else 1 + max_depth = depth_blocks * 4096 + + # Verify + if calculate_uram_blocks(min_depth, bitwidth) != blocks: + return (0, 0) + + return (min_depth, max_depth) + + +def calculate_srl16e_luts(depth: int, bitwidth: int) -> int: + """Calculate the number of SRL16E LUTs required for a FIFO. + + Args: + depth: FIFO depth (must be >= 2) + bitwidth: Data bitwidth + + Returns: + Number of SRL16E LUTs required without adress LUTs. + + Formula: LUTs = ⌈depth/32⌉ x ⌈bitwidth/2⌉ + """ + ram_luts = (math.ceil(depth / 32)) * (math.ceil(bitwidth / 2)) + return ram_luts + + +def calculate_srl16e_depth_range(luts: int, bitwidth: int) -> tuple[int, int]: + """Calculate the range of FIFO depths that use exactly the given number of SRL16E LUTs. + + Args: + luts: Number of SRL16E LUTs + bitwidth: Data bitwidth + + Returns: + Tuple of (min_depth, max_depth) that uses exactly 'luts' LUTs. + Returns (0, 0) if no valid range exists. + """ + if luts < 1: + return (0, 0) + + # SRL16E formula: luts = ⌈depth/32⌉ * ⌈bitwidth/2⌉ + bitwidth_factor = math.ceil(bitwidth / 2) + + # Calculate depth range + if luts % bitwidth_factor != 0: + return (0, 0) # Invalid LUT count for this bitwidth + + depth_blocks = luts // bitwidth_factor + min_depth = (depth_blocks - 1) * 32 + 1 if depth_blocks > 1 else 2 + max_depth = depth_blocks * 32 + + # Verify + if calculate_srl16e_luts(min_depth, bitwidth) != luts: + return (0, 0) + + return (min_depth, max_depth) diff --git a/src/finn/transformation/fpgadataflow/simulation_controller.py b/src/finn/transformation/fpgadataflow/simulation_controller.py new file mode 100644 index 0000000000..feaff91bad --- /dev/null +++ b/src/finn/transformation/fpgadataflow/simulation_controller.py @@ -0,0 +1,340 @@ +"""Control (node based) simulations via unix sockets.""" + +import json +import os +import socket +import subprocess +import threading +import time +from pathlib import Path +from rich.console import Console +from threading import Lock +from typing import Any + +from finn.util.basic import make_build_dir +from finn.util.exception import FINNInternalError +from finn.util.logging import ThreadsafeProgressDisplay + + +class SimulationController: + """Control a node-node IPC connected simulation in threads.""" + + def __init__( + self, + parallel_simulations: int, + names: list[str], + binaries: list[Path], + console: Console, + poll_interval: float = 1.0, + with_progressbar: bool = True, + ) -> None: + """Create a new controller, without starting the simulation. + + Args: + parallel_simulations: Number of simulations to run in parallel. + names: List of names for the simulations. + binaries: List of paths to the simulation binaries. + console: The rich.console.Console to print with. + poll_interval: How long the wait between checks of the processes stdout/stdin is. + with_progressbar: Whether or not to display a progressbar for the cycle count. + """ + if len(names) != len(binaries): + raise FINNInternalError( + f"Simulation controller received non-matching " + f"name and binary count: {len(names)} and {len(binaries)}" + ) + self.binaries = binaries + self.names = names + self.console = console + self.poll_interval = poll_interval + self.workers = parallel_simulations + self.progress = None + if with_progressbar: + self.progress = ThreadsafeProgressDisplay(names, [0] * len(names), names) + self.running_lock = Lock() + self.running = 0 + self.total = len(names) + self.logdir = Path(make_build_dir("simulation_logfiles_")) + + # Socket communication management + self.processes: list[tuple[subprocess.Popen, Any, Any]] = [] + self.sockets: list[tuple[socket.socket, str]] = [] + + # Early termination flag + self.should_stop = False + self.stop_lock = Lock() + + def _start_process(self, binary: Path, process_id: int, cpu: int = -1) -> int: + """Start a single C++ simulation process with its own Unix socket. + + Args: + binary: Path to the simulation executable + process_id: Unique identifier for this process + cpu: CPU core to bind to (if -1, no binding) + + Returns: + Index of the started process + """ + thread_id = threading.get_ident() + + # Create unique socket path which includes thread ID to avoid conflicts + # with multiple threads + socket_path = Path(f"/tmp/fifosim_sockets/{thread_id}/") + socket_path.mkdir(parents=True, exist_ok=True) + socket_path = socket_path / f"sim_socket_{process_id}.sock" + + # Remove socket if it exists + if socket_path.exists(): + socket_path.unlink() + + # Build command arguments + cmd = [str(binary), "--socket", socket_path] + + # Create log files for stdout and stderr + stdout_log = self.logdir / f"{process_id}_stdout_cpp.log" + stderr_log = self.logdir / f"{process_id}_stderr_cpp.log" + + stdout_file = stdout_log.open("w") + stderr_file = stderr_log.open("w") + + # Start C++ process - redirect stdout/stderr to files + cwd = binary.parent + # Set CPU affinity if a specific core is requested + preexec_fn = (lambda: os.sched_setaffinity(0, {cpu})) if cpu != -1 else None + proc = subprocess.Popen( + cmd, stdout=stdout_file, stderr=stderr_file, text=True, cwd=cwd, preexec_fn=preexec_fn + ) + + # Check if process started successfully + time.sleep(0.2) # Give process time to fail if there's an immediate error + if proc.poll() is not None: + stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr" + stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout" + stdout_file.close() + stderr_file.close() + msg = ( + f"C++ process exited immediately with code {proc.returncode}\n" + f"Stderr: {stderr_output}\nStdout: {stdout_output}" + ) + self.console.log(str(process_id) + ": " + msg) + raise RuntimeError(msg) + + # Create Unix socket and connect + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + + # Wait for C++ process to create socket (with timeout) + max_retries = 100 # 20 seconds total + connected = False + for i in range(max_retries): + # Check if process is still alive + if proc.poll() is not None: + stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr" + stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout" + stdout_file.close() + stderr_file.close() + msg = ( + f"C++ process died during socket wait with code {proc.returncode}\n" + f"Stderr: {stderr_output}\nStdout: {stdout_output}" + ) + self.console.log(str(process_id) + ": " + msg) + raise RuntimeError(msg) + + try: + sock.connect(str(socket_path)) + connected = True + break + except (FileNotFoundError, ConnectionRefusedError) as e: + if i == max_retries - 1: + stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr" + stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout" + stdout_file.close() + stderr_file.close() + msg = ( + f"Failed to connect to socket after {max_retries} retries\n" + f"Stderr: {stderr_output}\nStdout: {stdout_output}" + ) + self.console.log(str(process_id) + ": " + msg) + raise RuntimeError(msg) from e + time.sleep(0.2) + + if not connected: + stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr" + stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout" + stdout_file.close() + stderr_file.close() + msg = ( + f"Failed to connect to socket {socket_path}\n" + f"Stderr: {stderr_output}\nStdout: {stdout_output}" + ) + self.console.log(str(process_id) + ": " + msg) + raise RuntimeError(msg) + + self.processes.append((proc, stdout_file, stderr_file)) + self.sockets.append((sock, str(socket_path))) + return len(self.processes) - 1 + + def _send_command(self, process_idx: int, command: str, payload: dict[str, Any]) -> None: + """Send command and payload to a specific process. + + Args: + process_idx: Index of the process to send to + command: Command string (e.g., "start", "status", "stop") + payload: Dictionary containing command-specific data + """ + sock, _ = self.sockets[process_idx] + + message = {"command": command, "payload": payload} + + # Send length-prefixed message + msg_str = json.dumps(message) + msg_bytes = msg_str.encode("utf-8") + length = len(msg_bytes) + + # Send 4-byte length prefix (little-endian) + sock.sendall(length.to_bytes(4, byteorder="little")) + # Send actual message + sock.sendall(msg_bytes) + + def _receive_response(self, process_idx: int) -> dict[str, Any] | None: + """Receive response from a specific process. + + Args: + process_idx: Index of the process to receive from + + Returns: + Dictionary containing the response, or None if error + + Raises: + TimeoutError: If socket times out waiting for response + """ + sock, _ = self.sockets[process_idx] + + # Set 120 second timeout to prevent deadlocks + # Needs to be rather larger to give the simulation IO thread time to answer + sock.settimeout(120.0) + + # Read 4-byte length prefix + length_bytes = sock.recv(4) + if not length_bytes: + self.console.log(f"{process_idx}: Client disconnected.") + return None + + length = int.from_bytes(length_bytes, byteorder="little") + + # Read message data + msg_bytes = b"" + while len(msg_bytes) < length: + chunk = sock.recv(length - len(msg_bytes)) + if not chunk: + break + msg_bytes += chunk + + return json.loads(msg_bytes.decode("utf-8")) + + def _send_and_receive( + self, process_idx: int, command: str, payload: dict[str, Any] + ) -> dict[str, Any] | None: + """Send command and wait for response (convenience method). + + Args: + process_idx: Index of the process + command: Command string + payload: Command payload + + Returns: + Response dictionary + + Raises: + RuntimeError: If the subprocess has terminated with an error + """ + try: + self._send_command(process_idx, command, payload) + response = self._receive_response(process_idx) + + # If we got None (timeout or connection error), check if process crashed + if response is None: + proc, stdout_file, stderr_file = self.processes[process_idx] + returncode = proc.poll() + + if returncode is not None and returncode != 0: + # Process has terminated with an error + # Flush and read error logs + stdout_file.flush() + stderr_file.flush() + + stdout_log = self.logdir / f"{process_idx}_stdout_cpp.log" + stderr_log = self.logdir / f"{process_idx}_stderr_cpp.log" + + stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr" + stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout" + + # Raise the actual error from the subprocess + msg = ( + f"Subprocess (process_idx={process_idx}) terminated with" + f" exit code {returncode}.\n" + f"Stderr:\n{stderr_output}\n" + f"Stdout:\n{stdout_output}" + ) + raise RuntimeError(msg) from None + + return response + except (BrokenPipeError, ConnectionResetError, TimeoutError) as err: + # Connection error or timeout means the subprocess may have died + # Check if it exited with an error and raise that instead + proc, stdout_file, stderr_file = self.processes[process_idx] + returncode = proc.poll() + + if returncode is not None and returncode != 0: + # Process has terminated with an error + # Flush and read error logs + stdout_file.flush() + stderr_file.flush() + + stdout_log = self.logdir / f"{process_idx}_stdout_cpp.log" + stderr_log = self.logdir / f"{process_idx}_stderr_cpp.log" + + stderr_output = stderr_log.read_text() if stderr_log.exists() else "No stderr" + stdout_output = stdout_log.read_text() if stdout_log.exists() else "No stdout" + + # Raise the actual error from the subprocess + msg = ( + f"Subprocess (process_idx={process_idx}) terminated with" + f" exit code {returncode}.\n" + f"Stderr:\n{stderr_output}\n" + f"Stdout:\n{stdout_output}" + ) + raise RuntimeError(msg) from err # from None + + # If process exited cleanly (returncode == 0) or hasn't exited yet, + # this is an unexpected connection error + return None + + def _cleanup_sockets(self) -> None: + """Close all sockets and terminate all processes.""" + # Send stop command to all processes + errors = [] + for i in range(len(self.processes)): + try: + self._send_command(i, "stop", {}) + self._receive_response(i) + except Exception as e: # noqa + errors.append((i, e)) + + # Close sockets + for sock, socket_path in self.sockets: + sock.close() + socket_path_obj = Path(socket_path) + if socket_path_obj.exists(): + socket_path_obj.unlink(True) + + # Terminate processes and close file handles + for proc, stdout_file, stderr_file in self.processes: + proc.terminate() + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + finally: + stdout_file.close() + stderr_file.close() diff --git a/src/finn/transformation/fpgadataflow/simulation_isolated.py b/src/finn/transformation/fpgadataflow/simulation_isolated.py new file mode 100644 index 0000000000..a597918c45 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/simulation_isolated.py @@ -0,0 +1,642 @@ +"""Simulating layers on their own to observe their behaviour.""" +import io +import json +import pandas as pd +import re +import time +from collections.abc import Callable +from concurrent.futures import Future, ThreadPoolExecutor +from pathlib import Path, PosixPath, PurePath +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.base import Transformation +from rich.console import Console +from threading import Lock +from typing import Any, Literal, TypeAlias + +from finn.transformation.fpgadataflow.simulation import Simulation, store_fifo_data +from finn.transformation.fpgadataflow.simulation_build import SimulationType +from finn.transformation.fpgadataflow.simulation_controller import SimulationController +from finn.util.exception import FINNInternalError +from finn.util.logging import log + + +def get_time() -> str: + """Return the current time in a formatted hour:minutes:second string.""" + return f"[{time.strftime('%H:%M:%S')}]" + + +class NodeIsolatedSimulationController(SimulationController): + """Run simulations for node isolated cases.""" + + IsolatedSimLogData = dict[Literal["ready", "valid"], list[dict[str, int]]] + + def __init__( + self, + parallel_simulations: int, + names: list[str], + binaries: list[Path], + console: Console, + poll_interval: float = 1.0, + with_progressbar: bool = False, + ) -> None: + """Set up node isolated simulation.""" + super().__init__( + parallel_simulations, names, binaries, console, poll_interval, with_progressbar + ) + log.info("Started simulation controller") + + def get_logfile_path(self, binary_or_idx: Path | int) -> Path: + """Get the logfile for the given binary or process index.""" + if type(binary_or_idx) is int: + return ( + self.logdir / f"{binary_or_idx}_log_isolated_" + f"{self.names[binary_or_idx]}_python.txt" + ) + elif type(binary_or_idx) in [Path, PurePath, PosixPath]: # noqa + process_idx = self.binaries.index(binary_or_idx) # type: ignore + return self.logdir / f"{process_idx}_log_isolated_{self.names[process_idx]}_python.txt" + raise TypeError("Pass either a simulation binary path of an index") + + def write_log(self, logfile: io.TextIOWrapper, msg: str, flush: bool = True) -> None: + """Write a timestamped message to log.""" + logfile.write(f"{get_time()} {msg}\n") + if flush: + logfile.flush() + + def collect_results( + self, d: Path, readylog_name: str = "readylog.txt", validlog_name: str = "validlog.txt" + ) -> IsolatedSimLogData: + """Recieve the directory containing a binary and the simulation logs. + If no logs are found raises an error, otherwise return the postprocessed logs + read from JSON. + """ + readylog = d / readylog_name + validlog = d / validlog_name + if not readylog.exists() or not validlog.exists(): + raise FINNInternalError(f"Could not find simulation logs at {readylog} and {validlog}") + return { + "ready": json.loads(readylog.read_text()), + "valid": json.loads(validlog.read_text()), + } + + def run(self) -> dict[str, IsolatedSimLogData]: + """Run a node isolated simulation and return the collected + input ready / output valid data, indexed based on node names.""" + futures: list[Future] = [] + datalock = Lock() + total = len(self.binaries) + done = 0 + + # Important to initialize from names. Otherwise the results are added into the dict + # in the order in which they finished simulating. But we want to keep the model order. + data: dict[str, self.IsolatedSimLogData] = {name: {} for name in self.names} + + # TODO: Lock not needed; futures are not consumed just by + # TODO: using the callback, so we can unpack them later + + # Callback to show progress and save the simulation result + def _done_callback_generator(name: str) -> Callable: + nonlocal total, done, data, datalock + + def _f(future: Future) -> None: + nonlocal total, done, data, datalock + with datalock: + done += 1 + log.info( + f"[ [bold green]{int(100 * float(done)/float(total))}%" + f"[/bold green] ] {name} done!", + extra={"markup": True, "highlighter": None}, + ) + data[name] = future.result() + + return _f + + # Running the simulation threads + assert len(self.names) == len(self.binaries) + with self.console.status(f"Running simulation on every node. Log directory: {self.logdir}"): + start = time.time() + with ThreadPoolExecutor(len(self.binaries)) as tpe: + for i, binary in enumerate(self.binaries): + futures.append(tpe.submit(self._run_binary, binary)) + futures[-1].add_done_callback(_done_callback_generator(self.names[i])) + tpe.shutdown(wait=True) + elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start)) + log.info("Thread pool closed. Closing sockets and postprocessing data") + log.info(f"Simulations took {elapsed}") + + # Finish the logs and clean up the sockets + for binary in self.binaries: + with self.get_logfile_path(binary).open("a") as logfile: + self.write_log(logfile, "Cleaning up socket.") + self._cleanup_sockets() + + # Check for invalid data points + invalid = [] + for i, name in enumerate(data.keys()): + if data[name] is None: + invalid.append((name, i)) + if len(invalid) > 0: + raise FINNInternalError( + f"Lost connection / malformed response from nodes: " + f"{', '.join([str(x) for x in invalid])}" + ) + return data + + def _run_binary(self, binary: Path) -> IsolatedSimLogData | None: + """Thread routine: Run a single simulation from the given path and return + the collected results. Returns None if connection is lost.""" + process_index = self.binaries.index(binary) + with self.get_logfile_path(binary).open("w+") as logfile: + # Logging helper + def write_log(msg: str) -> None: + self.write_log(logfile, msg) + + # Initialize: Start simulation process and give the start command + write_log("Initializing simulation") + write_log(f"Binary is: {binary}") + proc_idx = self._start_process(binary, process_index) + response = self._send_and_receive(proc_idx, "start", {}) + if response is None: + write_log( + "No answer for the clients 'start' " "command received. Timeout or disconnect." + ) + return None + write_log(f"Start response: {response}") + + # Main loop + write_log("Beginning main loop") + logfile.flush() + total_status_requests = 0 + while True: + # Request status in regular intervals + time.sleep(self.poll_interval) + write_log("Sending status request") + response = self._send_and_receive(proc_idx, "status", {}) + total_status_requests += 1 + write_log(f"Status request {total_status_requests} sent.") + + # Process response + if response is None: + write_log("Status request answered with None: Timeout or connection lost.") + return None + state = response["state"] + write_log(f"Received answer for status request ({total_status_requests})") + + # If the simulation is done, postprocess and return the collected data + if state == "done": + write_log("Received done status. Sending stop signal to simulation.") + resp = self._send_and_receive(proc_idx, "stop", {}) + if resp is None: + write_log("No stop response received.") + else: + write_log("Stop successfully received.") + return self.collect_results(binary.parent) + + # Otherwise log the current status + # TODO: Field name - meaning wrong? + in_done = response["inputCyclesDone"] + in_target = response["inputCyclesTarget"] + out_done = response["outputCyclesDone"] + out_target = response["outputCyclesTarget"] + total_cycles = response["totalCycles"] + percent_simulated_input = int(100.0 * float(in_done) / float(in_target)) + percent_simulated_output = int(100.0 * float(out_done) / float(out_target)) + write_log("Status response:") + write_log(f"\tTotal cycles: {total_cycles}") + write_log( + f"\tInput data simulated: {percent_simulated_input}% " + f"({in_done} / {in_target})" + ) + write_log( + f"\tOutput data simulated: {percent_simulated_output}% " + f"({out_done} / {out_target})" + ) + + +FIFODepthConfig: TypeAlias = dict[int, dict[str, str | list[int]]] +IsoSimLogData = NodeIsolatedSimulationController.IsolatedSimLogData +IsoSimLogDataByLayer = dict[str, IsoSimLogData] # Indexed by layer name + + +class IsolatedSimulation(Simulation): + def __init__( + self, + model: ModelWrapper, + simulation_type: SimulationType, + fpgapart: str, + clk_ns: float, + functional_sim: bool, + workers: int | None = None, + ) -> None: + super().__init__(model, simulation_type, fpgapart, clk_ns, functional_sim, workers) + + def simulate(self) -> IsoSimLogDataByLayer: + """Simulate isolated nodes.""" + if self.simulation_type != SimulationType.NODE_BASED_ISOLATED: + raise FINNInternalError( + f"Called simulation function 'simulate_node_isolated' " + f"does not match provided simulation type " + f"{self.simulation_type}" + ) + names = [node.name for node in self.model.graph.node] + console = Console() + controller = NodeIsolatedSimulationController( + len(self.binaries), names, list(self.binaries.values()), console, 0.1, False + ) + return controller.run() + + +class RunLayerIsolatedSimulation(Transformation): + """Run a layer isolated simulation and calculate some information for a + later layer parallel simulation. + + This modifies or creates a pandas DF and stores it in a csv file. This file can be + modified by the node connected simulation as well.""" + + def __init__( + self, fpgapart: str, clk_ns: float, functional_sim: bool, output_dir: Path + ) -> None: + """Run isolated layer simulations. The + default location is at cfg.output_dir/report/fifo_data.csv.""" + super().__init__() + self.fpgapart = fpgapart + self.clk_ns = clk_ns + self.functional_sim = functional_sim + self.output_dir = output_dir + + # Read / create dataframe with default path + self.default_fifo_data_path = self.output_dir / "report" / "fifo_data.csv" + + def calculate_upper_bounds(self, data: IsoSimLogDataByLayer) -> dict[str, dict[str, int]]: + """Try to calculate an upper bound for the incoming FIFO size of the layers. + Return size indexed by layer name and stream name. + + >>> step = RunLayerIsolatedSimulation("", 0.0, False) + >>> bounds = step.calculate_upper_bounds({ + ... "A": { + ... "ready": [ + ... {"totalCycles": 43, "inputCyclesDone": 12, + ... "inputCyclesTarget": 24, "s_axi_0": 1, "s_axi_1": 0}, + ... {"totalCycles": 44, "inputCyclesDone": 13, + ... "inputCyclesTarget": 24, "s_axi_0": 0, "s_axi_1": 0}, + ... ], "valid": [] + ... }, + ... "B": { + ... "ready": [ + ... {"totalCycles": 100, "inputCyclesDone": 3, + ... "inputCyclesTarget": 10, "s_axi_0": 1, "s_axi_1": 1, + ... "s_axi_2": 0}, + ... ], "valid": [] + ... }, + ... "C": { + ... "ready": [ + ... {"totalCycles": 43, "inputCyclesDone": 14, + ... "inputCyclesTarget": 24, "s_axi_0": 1, "s_axi_1": 0}, + ... {"totalCycles": 44, "inputCyclesDone": 15, + ... "inputCyclesTarget": 24, "s_axi_0": 0, "s_axi_1": 0}, + ... ], "valid": [] + ... } + ... }) + >>> bounds["A"] + {'s_axi_0': 1, 's_axi_1': 2} + >>> bounds["B"] + {'s_axi_0': 0, 's_axi_1': 0, 's_axi_2': 1} + >>> bounds["C"] + {'s_axi_0': 0, 's_axi_1': 0} + """ + + # TODO: Proper pytest tests + def _any_ready(cycle_data: dict[str, int]) -> bool: + for key in cycle_data.keys(): + if ( + key not in ["totalCycles", "inputCyclesDone", "inputCyclesTarget"] + and cycle_data[key] == 1 + ): + return True + return False + + results: dict[str, dict[str, int]] = {} + for layer in data.keys(): + # Save all keys that are not + results[layer] = { + stream_name: 0 + for stream_name in data[layer]["ready"][0].keys() + if stream_name not in ["inputCyclesDone", "inputCyclesTarget", "totalCycles"] + } + for cycle_data_ready, cycle_data_valid in zip( + data[layer]["ready"], data[layer]["valid"], strict=True + ): + if cycle_data_ready["inputCyclesDone"] > int( + cycle_data_ready["inputCyclesTarget"] / 2.0 + ) and cycle_data_valid["outputCyclesDone"] > int( + cycle_data_valid["outputCyclesTarget"] / 2.0 + ): + break + for stream_name in results[layer].keys(): + # TODO: Currently on the C++ side we multiply the + # TODO: target cycles by 2, to get two samples + # TODO: We keep track of ready signals until we see + # TODO: the first ready after half of all cycles were seen. + # TODO: This might change in the future + if ( + cycle_data_ready["inputCyclesTarget"] % 2 != 0 + or cycle_data_valid["outputCyclesTarget"] % 2 != 0 + ): + raise FINNInternalError( + f"An 'inputCyclesTarget' / 'outputCyclesTarget' of layer {layer} seems " + f"to not be an even number. Currently, we double " + f"the target simulation cycles for every layer " + f"on the C++ side. This error may point towards " + f"a change on the C++ side, which may cause the " + f"need to update this function accordingly!" + ) + results[layer][stream_name] += int(cycle_data_ready[stream_name] == 0) + + # TODO: This calculation assumes, that if the producer does NOT fire the entire time, + # TODO: the consumer can read at least at the same speed as + # if the producer did, and not slower. + # TODO: (Since this would mean that less data pressure from + # the producer makes the consumer _slower_.) + # TODO: This should usually be the case, but is important to keep in mind. + return results + + def sanity_check_logged_data(self, data: IsoSimLogDataByLayer) -> None: + """Do checks on the returned data to make sure it is in spec. + + A correctly formatted example would be: + >>> data = { + ... "layer1": { + ... "ready": [{"totalCycles": 10, "inputCyclesDone": 5, + ... "inputCyclesTarget": 10, "s_axi_0": 1}], + ... "valid": [{"totalCycles": 10, "outputCyclesDone": 5, + ... "outputCyclesTarget": 10, "m_axi_0": 1}] + ... } + ... } + >>> sim = RunLayerIsolatedSimulation("", 0.0, False) + >>> sim.sanity_check_logged_data(data) + >>> + """ + # 0. Valid and ready are present + for layer, ldata in data.items(): + if "valid" not in ldata.keys(): + raise FINNInternalError( + f"Simulation log data of layer {layer} is missing the VALID log." + ) + if "ready" not in ldata.keys(): + raise FINNInternalError( + f"Simulation log data of layer {layer} is missing the READY log." + ) + # 1. All cycle datas are uniform and have at least one stream signal + for i, (layer, ldata) in enumerate(data.items()): + cycle_data = ldata["ready"] + ldata["valid"] + lengths: set[int] = {len(cycle.keys()) for cycle in cycle_data} + if len(lengths) != 1: + raise FINNInternalError( + f"Simulation log data inconsistent for layer " + f"{layer} ({i}). Differing number of fields per cycle." + ) + if next(iter(lengths)) < 4: + raise FINNInternalError( + f"Simulation for layer {layer} must contain " + f"atleast 4 fields (total cycles, AXI cycles " + f"done, AXI cycles target and at least one AXI " + f"ready/valid signal)!" + ) + # 2. All ready logs contain the required keywords + readykeys = ["inputCyclesDone", "inputCyclesTarget", "totalCycles"] + for rlayer, rdata in data.items(): + for cycle in rdata["ready"]: + if any(keyword not in cycle.keys() for keyword in readykeys): + raise FINNInternalError( + f"Simulation READY log of layer {rlayer} " + f"contains cycles that are missing a required key." + ) + if any(key not in readykeys and "axi" not in key for key in cycle.keys()): + raise FINNInternalError( + f"In the READY simulation log of layer " + f"{rlayer} there seem to be fields that " + f"are not expected keywords or AXI streams!" + ) + # 3. All valid logs contain the required keywords + validkeys = ["outputCyclesDone", "outputCyclesTarget", "totalCycles"] + for vlayer, vdata in data.items(): + for cycle in vdata["valid"]: + if any(keyword not in cycle.keys() for keyword in validkeys): + raise FINNInternalError( + f"Simulation VALID log of layer {vlayer} " + f"contains cycles that are missing a required key." + ) + if any(key not in validkeys and "axi" not in key for key in cycle.keys()): + raise FINNInternalError( + f"In the VALID simulation log of layer " + f"{vlayer} there seem to be fields that " + f"are not expected keywords or AXI streams!" + ) + # 4. Cycles done can never be larger then the number of total cycles passed in the sim + for layer, cdata in data.items(): + for line in cdata["ready"] + cdata["valid"]: + if ( + "inputCyclesDone" in line.keys() + and line["inputCyclesDone"] > line["totalCycles"] + ): + raise FINNInternalError( + f"Simulation log of layer {layer} looks incorrect: " + f"Number of active receiving cycles " + f"({line['inputCyclesDone']}) larger than number of " + f"total cycles passed ({line['totalCycles']})." + ) + if ( + "outputCyclesDone" in line.keys() + and line["outputCyclesDone"] > line["totalCycles"] + ): + raise FINNInternalError( + f"Simulation log of layer {layer} looks incorrect: " + f"Number of active producing cycles " + f"({line['outputCyclesDone']}) larger than number of " + f"total cycles passed ({line['totalCycles']})." + ) + # 5. Stream keywords can never have any other value than 1 (HIGH) or 0 (LOW) + reserved_keywords = readykeys + validkeys + for layer, ldata in data.items(): + for cycle_data in ldata["ready"] + ldata["valid"]: + for key in cycle_data.keys(): + if key not in reserved_keywords and cycle_data[key] not in [0, 1]: + raise FINNInternalError( + f"Layer {layer} has data point where a " + f"non-reserved field (thus an axi stream " + f"ready/valid signal) is neither 0 nor 1: " + f"Key: {key}, Value: {cycle_data[key]}" + ) + # 6. Data is not empty + for layer, ldata in data.items(): + if len(ldata["ready"]) == 0: + raise FINNInternalError(f"Layer {layer} has no ready data!") + if len(ldata["valid"]) == 0: + raise FINNInternalError(f"Layer {layer} has no valid data!") + # 7. Check that the order of axi streams corresponds to their names. This helps + # somewhat to guarantee that the order always stayed the same from building the simulations + # to evaluating their data + + # The number in the name should increase with every stream, from 0, without gaps + # and streams should be called "s_axis_" + readykeys = ["inputCyclesDone", "inputCyclesTarget", "totalCycles"] + for layer, ldata in data.items(): + for cycledict in ldata["ready"]: + current_stream_idx = 0 + for key in cycledict.keys(): + if key not in readykeys: + m = re.fullmatch(r"^s_axis_(\d+)$", key) + if m is None: + raise FINNInternalError( + f"Layer {layer} has a non-expected key that " + f"does not match the names of streams expected " + f"(s_axis_).\n\tKey is: {key}" + ) + stream_idx = m.group(1) + if int(stream_idx) != current_stream_idx: + raise FINNInternalError( + f"Layer {layer} has non-expected stream key " + f"that does not follow the expected index " + f"scheme: Current expected index is " + f"{current_stream_idx}. Got instead: " + f"{stream_idx}" + ) + current_stream_idx += 1 + # TODO: Check that names match vivado_stitch_ifnames. + # TODO: Currently there is no easy way to do this, since we never save the isolated + # TODO: node-models and vivado_stitch_ifnames is a metadata prop of that isolated model + + def percent_ready(self, data: IsoSimLogDataByLayer) -> dict[str, float]: + """Calculate how many percent of the time the layer was ready for input data. + Return indexed by layer name.""" + # TODO: Implement + return dict.fromkeys(data, 0) + + def apply(self, model: ModelWrapper) -> tuple[ModelWrapper, bool]: + """Run isolated layer simulations.""" + # Run the simulation + sim = IsolatedSimulation( + model, + SimulationType.NODE_BASED_ISOLATED, + self.fpgapart, + self.clk_ns, + self.functional_sim, + ) + data: IsoSimLogDataByLayer = sim.simulate() + + # Check if data looks good + log.info("Checking validity of received simulation data...") + start = time.time() + self.sanity_check_logged_data(data) + log.info(f"Validity check took {time.time() - start} seconds.") + + # Calculate upper bounds + log.info("Estimating upper bounds...") + start = time.time() + in_fifo_upper_bound = self.calculate_upper_bounds(data) + log.info(f"Estimation took {time.time() - start} seconds.") + + # Write into report file + upper_bounds_file = self.output_dir / "report" / "estimate_upper_fifo_bound.json" + upper_bounds_file.write_text(json.dumps(in_fifo_upper_bound, indent=4)) + log.info(f"Wrote results to: {upper_bounds_file}") + + # Save data into dataframe + # NOTE: We actually have to swap the order here: We recorded the _incoming_ FIFO sizes + # However the connected simulation stores the depths on the layers before it, so + # essentially _outgoing_ FIFO sizes. + + # NOTE: For this mapping to work, ordering has to be kept correctly in each step: + # 1. Mapping node.inputs to vivado_stitch_ifnames metadata prop (CreateStitchedIP) + # 2. Mapping IO shapes to ifnames from before (simulation_builder.py) + # 3. Mapping stream_descrs to M/S_AXIS_CONTROL array (C++ simulation creation) + # 4. Writing the data to json. Order of S_AXIS_CONTROL -> order in which JSON gets written + # IMPORTANT: Use nlohmann::ordered_json to keep the insertion order! + # 5. Reading the JSON into python (python dicts are ordered since 3.7) + # According to docs, the Python JSON module also keeps order + # 6. Syncing node.inputs to order of s_axi_... streams read from the JSON. + edited_bounds = {} + + # Fill edited_bounds with empty values + for node in model.graph.node: + suc = model.find_direct_successors(node) + if suc is None: + edited_bounds[node.name] = [-1] + else: + edited_bounds[node.name] = [-1] * len(suc) + + # For every node check its predecessors. + # Find the index/tensor that connects the predecessor and the current one + # Use that index to retrieve the fifo depth between them and save it + def get_index(a: Any, values: Any) -> int | None: + for i, val in enumerate(values): + if val == a: + return i + return None + + for node in model.graph.node: + # Rely on the fact that find_direct_predecessors gives the streams in-order + predecessors = model.find_direct_predecessors(node) + if predecessors is None: + continue + for predecessor in predecessors: + # Find out which m_axis stream of the predecessor leads to node + for producer_idx, pre_out in enumerate(predecessor.output): + if pre_out in node.input: + consumer_idx = get_index(pre_out, node.input) + if consumer_idx is None: + raise FINNInternalError( + f"Could not find index of " + f"{predecessor.name}'s output and " + f"{node.name}'s input: {pre_out}. " + f"Index in predecessor.output is " + f"{producer_idx}" + ) + # TODO: Switch to array instead of dict? + # We have to conver the string-key (s_axi_...) into the index of the dict + key = list(in_fifo_upper_bound[node.name].keys())[consumer_idx] + # TODO: Tests + edited_bounds[predecessor.name][producer_idx] = in_fifo_upper_bound[ + node.name + ][ + key + ] # noqa + log.info( + f"Incoming FIFO {node.name}[{key}/{consumer_idx}] " + f"-> outgoing FIFO {predecessor.name}[{producer_idx}]" + ) + + # Prepare the data + df_data = { + "onnx_index": [], + "node": [], + "stream": [], + "out_fifo_upper_bound": [], + "input_ready_percent": [], + } + for layer, layerdata in edited_bounds.items(): + for idx in range(len(layerdata)): + df_data["onnx_index"].append([n.name for n in model.graph.node].index(layer)) + df_data["node"].append(layer) + df_data["stream"].append(idx) + df_data["out_fifo_upper_bound"].append(layerdata[idx]) + # TODO: Remove input_ready_percent? + # df_data["input_ready_percent"].append(self.percent_ready(data)[layer]) + df_data["input_ready_percent"].append(0.0) + + # Create the DF + self.fifo_data = pd.DataFrame(df_data) + log.info("First few entries of collected data:") + log.info(str(self.fifo_data)) + + # Save in dataframe and model + model = store_fifo_data( + model, + self.fifo_data, + self.default_fifo_data_path, + delete_existing=True, + store_html=True, + ) + + # TODO: Integrate data into the layer parallel simulation + return model, False diff --git a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py index 966bff3d65..e7c88a7766 100644 --- a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py +++ b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py @@ -39,8 +39,7 @@ def _get_signed_from_upstream(model, trunc_node): - """ - Find out what the sign of the input to the trunc node is, + """Find out what the sign of the input to the trunc node is, by looking at the upstream nodes. """ node = trunc_node @@ -112,10 +111,36 @@ def _get_signed_from_upstream(model, trunc_node): class AvgPoolAndTruncToQuantAvgPool(Transformation): - """ - Convert a section of nodes of the pattern: + """Convert a section of nodes of the pattern: AveragePool -> Mul (scalar) -> Trunc - To the FINN op: QuantAvgPool2d + To the FINN op: QuantAvgPool2d. + """ + + def apply(self, model): + opset_imports = model.get_opset_imports() + if "qonnx.custom_op.general" in opset_imports: + trunc_opset = opset_imports["qonnx.custom_op.general"] + elif "onnx.brevitas" in opset_imports: + trunc_opset = opset_imports["onnx.brevitas"] + else: + trunc_opset = 1 # Default to v1 if no opset found + if trunc_opset == 1: + model = model.transform(AvgPoolAndTruncv1ToQuantAvgPool()) + return model, False + elif trunc_opset == 2: + model = model.transform(AvgPoolAndTruncv2ToQuantAvgPool()) + return model, False + else: + raise NotImplementedError( + f"AvgPoolAndTruncToQuantAvgPool not implemented for " + f"Trunc opset version {trunc_opset}." + ) + + +class AvgPoolAndTruncv1ToQuantAvgPool(Transformation): + """Convert a section of nodes of the pattern: + AveragePool -> Mul (scalar) -> Trunc (v1) + To the FINN op: Div -> QuantAvgPool2d -> Mul. """ def apply(self, model): @@ -164,7 +189,7 @@ def apply(self, model): k_s = get_by_name(n.attribute, "kernel_shape") if k_s is None or len(k_s.ints) != 2 or len(set(k_s.ints)) != 1: raise ValueError( - "FINN only supports average pooling with " "2D square kernels." + "FINN only supports average pooling with 2D square kernels." ) k_s = k_s.ints[0] @@ -197,7 +222,7 @@ def apply(self, model): normalized_mode_string = rounding_mode.s.upper() if rounding_mode is None or normalized_mode_string != b"FLOOR": raise ValueError( - "The Trunc node must have the rounding_mode " "set to 'FLOOR'." + "The Trunc node must have the rounding_mode set to 'FLOOR'." ) for inp in t_node.input[1:]: if model.get_initializer(inp) is None: @@ -314,10 +339,9 @@ def apply(self, model): class AvgPoolAndTruncv2ToQuantAvgPool(Transformation): - """ - Convert a section of nodes of the pattern: + """Convert a section of nodes of the pattern: AveragePool -> Trunc (v2) - To the FINN op: Div -> QuantAvgPool2d -> Mul + To the FINN op: Div -> QuantAvgPool2d -> Mul. """ def apply(self, model): @@ -335,7 +359,7 @@ def apply(self, model): k_s = get_by_name(node.attribute, "kernel_shape") if k_s is None or len(k_s.ints) != 2 or len(set(k_s.ints)) != 1: raise ValueError( - "FINN only supports average pooling with " "2D square kernels." + "FINN only supports average pooling with 2D square kernels." ) k_s = k_s.ints[0] @@ -346,7 +370,7 @@ def apply(self, model): stride = get_by_name(node.attribute, "strides") if stride is None or len(stride.ints) != 2 or len(set(stride.ints)) != 1: raise ValueError( - "FINN only supports 2D strides with equal values in " "each direction." + "FINN only supports 2D strides with equal values in each direction." ) stride = stride.ints[0] @@ -355,7 +379,7 @@ def apply(self, model): normalized_mode_string = rounding_mode.s.upper() if rounding_mode is None or normalized_mode_string != b"FLOOR": raise ValueError( - "The Trunc node must have the rounding_mode " "set to 'FLOOR'." + "The Trunc node must have the rounding_mode set to 'FLOOR'." ) for inp in t_node.input[1:]: if model.get_initializer(inp) is None: diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index f034f49bc5..503554fff6 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -164,7 +164,7 @@ def get_liveness_threshold_cycles(): return int(os.getenv("LIVENESS_THRESHOLD", 1000000)) -def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str | Path: +def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str: """Creates a folder with given prefix to be used as a build dir. Use this function instead of tempfile.mkdtemp to ensure any generated files will survive on the host after the FINN Docker container exits.""" @@ -185,7 +185,7 @@ def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str | Path return str(tmpdir) -def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): +def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True, print_stderr=True): """Helper function to launch a process in a way that facilitates logging stdout/stderr with Python loggers. Returns (cmd_out, cmd_err) if successful, raises CalledProcessError otherwise.""" @@ -204,7 +204,7 @@ def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): # Handle stderr, depending on return code if process.returncode == 0: # Process completed successfully, log stderr only as WARNING - if cmd_err: + if cmd_err and print_stderr: log.warning(cmd_err) else: # Process failed, log stderr as ERROR diff --git a/src/finn/util/deprecated.py b/src/finn/util/deprecated.py index 6985653ac0..8593aa8388 100644 --- a/src/finn/util/deprecated.py +++ b/src/finn/util/deprecated.py @@ -1,13 +1,12 @@ """Implements a decorator to mark functions as deprecated.""" - import functools +import warnings from collections.abc import Callable from typing import ParamSpec, TypeVar - from finn.util.logging import log rT = TypeVar("rT") # return type # noqa: N816 -pT = ParamSpec("pT") # parameters type # noqa: N816 +pT = ParamSpec("pT") # parameters type # noqa: N816 def deprecated(func: Callable[pT, rT]) -> Callable[pT, rT]: diff --git a/src/finn/util/logging.py b/src/finn/util/logging.py index 3ae86f4dfd..50c73ef0ea 100644 --- a/src/finn/util/logging.py +++ b/src/finn/util/logging.py @@ -1,13 +1,12 @@ -"""Logging utilities for FINN using Rich console. - -This module provides logging configuration and utilities for FINN, -including a Rich console for formatted output. -""" - +"""Handle logging related functionality.""" import logging from rich.console import Console +from rich.progress import Progress, TaskID +from threading import Lock from types import TracebackType +log = logging.getLogger("finn_logger") + # Top level console used by logger # Can be retrieved to create for example status displays in Rich _RICH_CONSOLE = Console() @@ -36,8 +35,6 @@ def set_console(console: Console) -> None: _RICH_CONSOLE = console -log = logging.getLogger("finn_logger") - class LogDisabledConsole: """Use to get a console to use for Rich formatting without logging enabled.""" @@ -74,3 +71,65 @@ def __exit__( Exception traceback. """ log.disabled = False + + +class ThreadsafeProgressDisplay: + """Small helper to display multithreaded display bars. + Logging has to be disabled before usage. + """ + + def __init__( + self, tasks: list[str], totals: list[int | float], descriptions: list[str] + ) -> None: + """Create a new progress display.""" + self.lock = Lock() + self.state: dict[str, int | float] = dict.fromkeys(tasks, 0) + self.ptasks: dict[str, TaskID] = {} + self.totals_state = dict(zip(tasks, totals, strict=True)) + + self.tasks: list[str] = tasks + self.totals: list[float | int] = totals + self.descriptions: list[str] = descriptions + assert len(tasks) == len(totals) + assert len(totals) == len(descriptions) + + def start(self) -> None: + """Start the display.""" + self.progress = Progress(transient=True, redirect_stdout=False, redirect_stderr=False) + self.progress.start() + for task, desc, total in zip(self.tasks, self.descriptions, self.totals, strict=True): + self.ptasks[task] = self.progress.add_task(desc, total=total) + + def update(self, task: str, value: float | None = None, total: float | None = None) -> None: + """Update a value and the progress bar. If the task does not exist do nothing. + This is practical, because it means any method can update the progressbar + without any danger. Just the initially calling method must create a fitting display object. + + If value is None, the value is incremented once. + """ + if task in self.state and task in self.ptasks: + # NOTE: rich.progress at some point apparently became threadsafe, + # but just to be extra sure we add a lock here. + with self.lock: + if value is None: + self.state[task] += 1 + else: + self.state[task] = value + if total is not None: + self.totals_state[task] = total + self.progress.update( + self.ptasks[task], + completed=self.state[task], + refresh=True, + total=self.totals_state[task], + ) + + def stop(self) -> None: + """Stop the display.""" + self.progress.stop() + + def __enter__(self) -> None: + self.start() + + def __exit__(self, tp, vl, tb) -> None: + self.stop() diff --git a/src/finn/xsi/setup.py b/src/finn/xsi/setup.py index 609eabd88c..3aa6c91523 100644 --- a/src/finn/xsi/setup.py +++ b/src/finn/xsi/setup.py @@ -72,7 +72,7 @@ def get_build_paths() -> Tuple[List[str], str, List[str]]: compiler = "clang++" # Compile flags - extra_compile_args = ["--std=c++17", "-Wall", "-O3", "-shared", "-fPIC"] + extra_compile_args = ["--std=c++20", "-Wall", "-O3", "-shared", "-fPIC"] return include_dirs, compiler, extra_compile_args @@ -147,7 +147,7 @@ def build_xsi(force: bool = False, verbose: bool = True) -> bool: include_dirs, compiler, compile_args = get_build_paths() # Source files - source_files = ["xsi_bind.cpp", "xsi_finn.cpp"] + source_files = ["xsi_bind.cpp", "src/Port.cpp", "src/Design.cpp", "src/Kernel.cpp", "src/SharedLibrary.cpp"] # Build command cmd = [compiler] + compile_args @@ -155,6 +155,7 @@ def build_xsi(force: bool = False, verbose: bool = True) -> bool: # Add include directories for inc_dir in include_dirs: cmd.extend(["-I", inc_dir]) + cmd.extend(["-I", "./include"]) # Output file cmd.extend(["-o", "xsi.so"]) diff --git a/tests/fpgadataflow/test_bram_block_search.py b/tests/fpgadataflow/test_bram_block_search.py new file mode 100644 index 0000000000..2542fde769 --- /dev/null +++ b/tests/fpgadataflow/test_bram_block_search.py @@ -0,0 +1,465 @@ +"""Test BRAM block calculations and search algorithms.""" +# ruff: noqa: ANN201, SLF001 + +import pytest + +import math + +from finn.transformation.fpgadataflow.simulation import ( + calculate_bram_blocks, + calculate_bram_depth_range, +) + + +class TestBRAMBlockCalculations: + """Test BRAM block calculation functions.""" + + def test_calculate_bram_blocks_bitwidth_1(self) -> None: + """Test BRAM block calculation for 1-bit data.""" + assert calculate_bram_blocks(1, 1) == 1 + assert calculate_bram_blocks(16384, 1) == 1 + assert calculate_bram_blocks(16385, 1) == 2 + assert calculate_bram_blocks(32768, 1) == 2 + + def test_calculate_bram_blocks_bitwidth_2(self) -> None: + """Test BRAM block calculation for 2-bit data.""" + assert calculate_bram_blocks(1, 2) == 1 + assert calculate_bram_blocks(8192, 2) == 1 + assert calculate_bram_blocks(8193, 2) == 2 + assert calculate_bram_blocks(16384, 2) == 2 + + def test_calculate_bram_blocks_bitwidth_4(self) -> None: + """Test BRAM block calculation for 4-bit data.""" + assert calculate_bram_blocks(1, 4) == 1 + assert calculate_bram_blocks(4096, 4) == 1 + assert calculate_bram_blocks(4097, 4) == 2 + assert calculate_bram_blocks(8192, 4) == 2 + + def test_calculate_bram_blocks_bitwidth_9(self) -> None: + """Test BRAM block calculation for 9-bit data.""" + assert calculate_bram_blocks(1, 9) == 1 + assert calculate_bram_blocks(2048, 9) == 1 + assert calculate_bram_blocks(2049, 9) == 2 + + def test_calculate_bram_blocks_bitwidth_18(self) -> None: + """Test BRAM block calculation for 18-bit data.""" + assert calculate_bram_blocks(1, 18) == 1 + assert calculate_bram_blocks(1024, 18) == 1 + assert calculate_bram_blocks(1025, 18) == 2 + + def test_calculate_bram_blocks_wide_bitwidth_deep(self) -> None: + """Test BRAM block calculation for wide bitwidth with depth > 512.""" + # bitwidth = 40, depth = 1024 > 512 + # Uses formula: ⌈1024/1024⌉ * ⌈40/18⌉ = 1 * 3 = 3 + assert calculate_bram_blocks(1024, 40) == 3 + + def test_calculate_bram_blocks_wide_bitwidth_shallow(self) -> None: + """Test BRAM block calculation for wide bitwidth with depth <= 512.""" + # bitwidth = 40, depth = 512 <= 512 + # Uses formula: ⌈512/512⌉ * ⌈40/36⌉ = 1 * 2 = 2 + assert calculate_bram_blocks(512, 40) == 2 + + +class TestBRAMDepthRange: + """Test BRAM depth range inversion function.""" + + def test_depth_range_bitwidth_1(self) -> None: + """Test depth range calculation for 1-bit data.""" + min_d, max_d = calculate_bram_depth_range(1, 1) + assert min_d == 1 + assert max_d == 16384 + assert calculate_bram_blocks(min_d, 1) == 1 + assert calculate_bram_blocks(max_d, 1) == 1 + + min_d, max_d = calculate_bram_depth_range(2, 1) + assert min_d == 16385 + assert max_d == 32768 + assert calculate_bram_blocks(min_d, 1) == 2 + assert calculate_bram_blocks(max_d, 1) == 2 + + def test_depth_range_bitwidth_4(self) -> None: + """Test depth range calculation for 4-bit data.""" + min_d, max_d = calculate_bram_depth_range(1, 4) + assert min_d == 1 + assert max_d == 4096 + assert calculate_bram_blocks(min_d, 4) == 1 + assert calculate_bram_blocks(max_d, 4) == 1 + + def test_depth_range_bitwidth_5_valid_blocks(self) -> None: + """Test block count validation for bitwidth=5.""" + # bitwidth=5 uses ⌈5/9⌉=1 bitwidth factor (falls in <=9 range) + # So all blocks should be valid + min_d, max_d = calculate_bram_depth_range(1, 5) + assert max_d > 0, "1 block should be valid for bitwidth=5" + assert calculate_bram_blocks(min_d, 5) == 1 + assert calculate_bram_blocks(max_d, 5) == 1 + + min_d, max_d = calculate_bram_depth_range(2, 5) + assert max_d > 0, "2 blocks should be valid for bitwidth=5" + assert calculate_bram_blocks(min_d, 5) == 2 + assert calculate_bram_blocks(max_d, 5) == 2 + + def test_depth_range_bitwidth_10_valid_blocks(self) -> None: + """Test block count validation for bitwidth=10.""" + # bitwidth=10 uses ⌈10/18⌉=1 bitwidth factor (falls in <=18 range) + min_d, max_d = calculate_bram_depth_range(1, 10) + assert max_d > 0 + assert calculate_bram_blocks(min_d, 10) == 1 + + min_d, max_d = calculate_bram_depth_range(2, 10) + assert max_d > 0 + assert calculate_bram_blocks(min_d, 10) == 2 + + def test_depth_range_wide_bitwidth(self) -> None: + """Test depth range for wide bitwidths > 18.""" + # bitwidth=40 has two modes depending on depth + min_d, max_d = calculate_bram_depth_range(2, 40) + # Should use depth ≤ 512 mode: ⌈depth/512⌉ * ⌈40/36⌉ + # 2 blocks / 2 = 1 depth_blocks → (1, 512) + if max_d > 0: + assert max_d <= 512 + assert calculate_bram_blocks(min_d, 40) == 2 + + def test_depth_range_consistency_all_bitwidths(self) -> None: + """Test that all valid ranges actually produce the correct block count.""" + for bitwidth in range(1, 8192): + for blocks in range(1, 1024): + min_d, max_d = calculate_bram_depth_range(blocks, bitwidth) + if max_d > 0: # Valid configuration + # Verify both endpoints produce correct block count + assert calculate_bram_blocks(min_d, bitwidth) == blocks, ( + f"Min depth {min_d} for {blocks} blocks, " + f"bitwidth {bitwidth} produces wrong count" + ) + assert calculate_bram_blocks(max_d, bitwidth) == blocks, ( + f"Max depth {max_d} for {blocks} blocks, " + f"bitwidth {bitwidth} produces wrong count" + ) + + # Verify just outside the range produces different counts + if min_d > 1: + assert calculate_bram_blocks(min_d - 1, bitwidth) < blocks + assert calculate_bram_blocks(max_d + 1, bitwidth) > blocks + + +class TestGetValidBlockCounts: + """Test the _get_valid_block_counts helper method.""" + + def test_all_valid_bitwidth_1(self) -> None: + """Test that all block counts are valid for bitwidth=1.""" + from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation + + # Create dummy instance just to test the method + sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation) + + valid_blocks = sim._get_valid_block_counts(1, 10, 1) + assert valid_blocks == list(range(1, 11)) + + def test_wide_bitwidth_filtering(self) -> None: + """Test that some block counts may be invalid for wide bitwidths.""" + from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation + + sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation) + + # For bitwidth > 18, some block counts may be invalid + valid_blocks = sim._get_valid_block_counts(1, 20, 40) + # Verify all returned blocks produce valid ranges + for b in valid_blocks: + _, max_d = calculate_bram_depth_range(b, 40) + assert max_d > 0, f"Block {b} should produce valid range" + + def test_range_respects_bounds(self) -> None: + """Test that valid blocks respect min/max bounds.""" + from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation + + sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation) + + valid_blocks = sim._get_valid_block_counts(5, 15, 1) + assert min(valid_blocks) >= 5 + assert max(valid_blocks) <= 15 + assert len(valid_blocks) == 11 + + def test_empty_when_no_valid_in_range(self) -> None: + """Test that empty list is returned when no valid configs exist in range.""" + from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation + + sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation) + + # Test a scenario where the range might have no valid blocks + # (this is rare but the method should handle it) + valid_blocks = sim._get_valid_block_counts(100, 99, 5) # Invalid range + assert valid_blocks == [] + + +class TestExponentialBinarySearchLogic: + """Test the exponential + binary search algorithm logic (without actual simulation).""" + + def test_exponential_indices_progression(self) -> None: + """Test that exponential search correctly progresses through indices.""" + # Simulate the exponential index progression + valid_blocks = list(range(1, 101)) # 100 valid blocks + + # Exponential progression should be: 0, 1, 2, 4, 8, 16, 32, 64... + exp_idx = 0 + indices_checked = [] + + while exp_idx < len(valid_blocks) - 1: + indices_checked.append(exp_idx) + exp_idx = min(exp_idx * 2 if exp_idx > 0 else 1, len(valid_blocks) - 1) + + assert indices_checked == [0, 1, 2, 4, 8, 16, 32, 64] + + def test_binary_search_reduces_range(self) -> None: + """Test that binary search correctly narrows the range.""" + lower_idx = 0 + upper_idx = 99 + + iterations = 0 + while lower_idx < upper_idx: + mid_idx = (lower_idx + upper_idx) // 2 + # Simulate "success" for indices < 50 + if mid_idx < 50: + upper_idx = mid_idx + else: + lower_idx = mid_idx + 1 + iterations += 1 + + # Prevent infinite loop in test + if iterations > 20: + break + + assert lower_idx == upper_idx + assert iterations <= 7 # log2(100) ≈ 6.6 + + +class TestSRL16ELUTCalculations: + """Test SRL16E LUT calculation functions.""" + + def test_calculate_srl16e_luts_basic(self): + """Test basic SRL16E LUT calculations.""" + from finn.transformation.fpgadataflow.simulation import calculate_srl16e_luts + + # Formula: LUTs = ⌈depth/32⌉ * ⌈bitwidth/2⌉ + # depth=32, bitwidth=2: ⌈32/32⌉ * ⌈2/2⌉ = 1 * 1 = 1 + assert calculate_srl16e_luts(32, 2) == 1 + + # depth=64, bitwidth=2: ⌈64/32⌉ * ⌈2/2⌉ = 2 * 1 = 2 + assert calculate_srl16e_luts(64, 2) == 2 + + # depth=32, bitwidth=4: ⌈32/32⌉ * ⌈4/2⌉ = 1 * 2 = 2 + assert calculate_srl16e_luts(32, 4) == 2 + + # depth=33, bitwidth=2: ⌈33/32⌉ * ⌈2/2⌉ = 2 * 1 = 2 + assert calculate_srl16e_luts(33, 2) == 2 + + def test_calculate_srl16e_luts_various_bitwidths(self): + """Test SRL16E LUT calculations for various bitwidths.""" + from finn.transformation.fpgadataflow.simulation import calculate_srl16e_luts + + # Bitwidth 1: ⌈1/2⌉ = 1 + assert calculate_srl16e_luts(32, 1) == 1 + assert calculate_srl16e_luts(64, 1) == 2 + + # Bitwidth 3: ⌈3/2⌉ = 2 + assert calculate_srl16e_luts(32, 3) == 2 + assert calculate_srl16e_luts(64, 3) == 4 + + # Bitwidth 8: ⌈8/2⌉ = 4 + assert calculate_srl16e_luts(32, 8) == 4 + assert calculate_srl16e_luts(64, 8) == 8 + + def test_calculate_srl16e_luts_small_depths(self): + """Test SRL16E LUT calculations for small depths.""" + from finn.transformation.fpgadataflow.simulation import calculate_srl16e_luts + + # Small depths still use at least 1 LUT per bitwidth factor + assert calculate_srl16e_luts(2, 2) == 1 + assert calculate_srl16e_luts(16, 2) == 1 + assert calculate_srl16e_luts(31, 2) == 1 + + +class TestSRL16EDepthRange: + """Test SRL16E depth range inversion function.""" + + def test_depth_range_basic(self): + """Test basic depth range calculation for SRL16E.""" + from finn.transformation.fpgadataflow.simulation import ( + calculate_srl16e_depth_range, + calculate_srl16e_luts, + ) + + # 1 LUT, bitwidth=2 + min_d, max_d = calculate_srl16e_depth_range(1, 2) + assert min_d == 2 + assert max_d == 32 + assert calculate_srl16e_luts(min_d, 2) == 1 + assert calculate_srl16e_luts(max_d, 2) == 1 + + def test_depth_range_bitwidth_1(self): + """Test depth range for 1-bit data.""" + from finn.transformation.fpgadataflow.simulation import ( + calculate_srl16e_depth_range, + calculate_srl16e_luts, + ) + + min_d, max_d = calculate_srl16e_depth_range(1, 1) + assert min_d == 2 + assert max_d == 32 + assert calculate_srl16e_luts(min_d, 1) == 1 + assert calculate_srl16e_luts(max_d, 1) == 1 + + min_d, max_d = calculate_srl16e_depth_range(2, 1) + assert min_d == 33 + assert max_d == 64 + assert calculate_srl16e_luts(min_d, 1) == 2 + assert calculate_srl16e_luts(max_d, 1) == 2 + + def test_depth_range_invalid_odd_luts(self): + """Test that odd LUT counts are invalid for certain bitwidths.""" + from finn.transformation.fpgadataflow.simulation import calculate_srl16e_depth_range + + # Bitwidth=4: ⌈4/2⌉ = 2, so only even LUT counts are valid + _, max_d = calculate_srl16e_depth_range(1, 4) + assert max_d == 0, "1 LUT should be invalid for bitwidth=4" + + _, max_d = calculate_srl16e_depth_range(2, 4) + assert max_d > 0, "2 LUTs should be valid for bitwidth=4" + + def test_depth_range_consistency(self): + """Test that all valid ranges produce the correct LUT count.""" + from finn.transformation.fpgadataflow.simulation import ( + calculate_srl16e_depth_range, + calculate_srl16e_luts, + ) + + for bitwidth in [1, 2, 3, 4, 8, 16]: + for luts in range(1, 20): + min_d, max_d = calculate_srl16e_depth_range(luts, bitwidth) + if max_d > 0: # Valid configuration + # Verify both endpoints produce correct LUT count + assert calculate_srl16e_luts(min_d, bitwidth) == luts, ( + f"Min depth {min_d} for {luts} LUTs, " + f"bitwidth {bitwidth} produces wrong count" + ) + assert calculate_srl16e_luts(max_d, bitwidth) == luts, ( + f"Max depth {max_d} for {luts} LUTs, " + f"bitwidth {bitwidth} produces wrong count" + ) + + # Verify just outside the range produces different counts + if min_d > 2: + assert calculate_srl16e_luts(min_d - 1, bitwidth) < luts + assert calculate_srl16e_luts(max_d + 1, bitwidth) > luts + + +class TestNeedsMinimization: + """Test the needs_minimization method.""" + + # TODO: Maybe remove this behavior + def test_small_depths_no_minimization(self): + """Test that small depths don't need minimization.""" + from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation + + sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation) + sim.max_qsrl_depth = 256 + + # Depths <= 32 don't need minimization (fit in bitwidth/2 LUTs) + assert not sim._needs_minimization(32, 8) + assert not sim._needs_minimization(16, 8) + assert not sim._needs_minimization(2, 8) + + # TODO: Maybe remove this behavior + def test_qsrl_range_no_minimization(self): + """Test that depths within QSRL range don't need minimization.""" + from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation + + sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation) + sim.max_qsrl_depth = 256 + + # Depths within max_qsrl_depth don't need minimization + assert not sim._needs_minimization(128, 8) + assert not sim._needs_minimization(256, 8) + + def test_large_depths_need_minimization(self): + """Test that large depths with multiple BRAM blocks need minimization.""" + from finn.transformation.fpgadataflow.simulation import ( + RunLayerParallelSimulation, + calculate_bram_blocks, + calculate_bram_depth_range, + ) + + sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation) + sim.max_qsrl_depth = 256 + + # Test with specific known cases first + # bitwidth=8: 1 BRAM range is (1, 2048) + # Use depth > 2048 to get multiple blocks + depth = 5000 + bitwidth = 8 + blocks = calculate_bram_blocks(depth, bitwidth) + assert blocks > 1, f"depth={depth}, bitwidth={bitwidth} should use >1 BRAM" + assert sim._needs_minimization(depth, bitwidth) + + # bitwidth=18: 1 BRAM range is (1, 1024) + # Use depth > 1024 to get multiple blocks + depth = 3000 + bitwidth = 18 + blocks = calculate_bram_blocks(depth, bitwidth) + assert blocks > 1, f"depth={depth}, bitwidth={bitwidth} should use >1 BRAM" + assert sim._needs_minimization(depth, bitwidth) + + # Verify that depth with 1 BRAM doesn't need minimization + # when it's at minimum block count + depth = 1000 + bitwidth = 8 + blocks = calculate_bram_blocks(depth, bitwidth) + assert blocks == 1 + assert not sim._needs_minimization(depth, bitwidth) + + # Exhaustive test: check that depths with MORE than minimum BRAM blocks + # need minimization (unless very close to QSRL threshold) + for bw in range(1, 64): + # Find the minimum achievable block count for this bitwidth + min_blocks = None + max_d = 0 + test_blocks = 1 + while max_d == 0: + _, max_d = calculate_bram_depth_range(test_blocks, bw) + if max_d > 0: + min_blocks = test_blocks + break + test_blocks += 1 + + if min_blocks is None: + continue # Skip if no valid config found + + # Test depths that use more blocks than minimum + for depth in range(1, 8192): + blocks = calculate_bram_blocks(depth, bw) + + # Only expect minimization if blocks > minimum achievable + if blocks > min_blocks and depth > math.floor(sim.max_qsrl_depth * 1.1): + assert sim._needs_minimization(depth, bw), ( + f"depth={depth}, bw={bw}, blocks={blocks}, min_blocks={min_blocks} " + f"should need minimization" + ) + + def test_minimum_bram_edge_case(self): + """Test edge case at minimum BRAM blocks.""" + from finn.transformation.fpgadataflow.simulation import RunLayerParallelSimulation + + sim = RunLayerParallelSimulation.__new__(RunLayerParallelSimulation) + sim.max_qsrl_depth = 256 + + # A depth that's just slightly above max_qsrl_depth with minimum BRAM blocks + # The behavior depends on whether it's deemed too close to optimize + depth = 300 + bitwidth = 1 + + # Verify the method executes without error + result = sim._needs_minimization(depth, bitwidth) + assert isinstance(result, bool) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])