From b92fd749fe829c669c9d3ae13be4ecb8abd703c2 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 7 Aug 2025 09:24:50 -0500 Subject: [PATCH 01/14] laying the foundation for oneapi accelerator --- hls4ml/backends/oneapi/oneapi_backend.py | 4 +- .../backends/oneapi_accelerator/__init__.py | 0 .../oneapi_accelerator_backend.py | 18 +++ hls4ml/templates/oneapi/firmware/defines.h | 1 - .../firmware/nnet_utils/nnet_data_movement.h | 142 ++++++++++++++++++ .../oneapi/firmware/nnet_utils/nnet_helpers.h | 21 --- .../oneapi/firmware/nnet_utils/nnet_types.h | 11 ++ hls4ml/templates/oneapi/myproject_bridge.cpp | 2 +- hls4ml/templates/oneapi/myproject_test.cpp | 1 + 9 files changed, 175 insertions(+), 25 deletions(-) create mode 100644 hls4ml/backends/oneapi_accelerator/__init__.py create mode 100644 hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py index 9f3269f90b..b2c4e807aa 100644 --- a/hls4ml/backends/oneapi/oneapi_backend.py +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -17,8 +17,8 @@ class OneAPIBackend(FPGABackend): - def __init__(self): - super().__init__('oneAPI') + def __init__(self, name='oneAPI'): # the default name should be used in most cases + super().__init__(name) self._register_layer_attributes() self._register_flows() diff --git a/hls4ml/backends/oneapi_accelerator/__init__.py b/hls4ml/backends/oneapi_accelerator/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py new file mode 100644 index 0000000000..52438de1b5 --- /dev/null +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py @@ -0,0 +1,18 @@ +from hls4ml.backends import OneAPIBackend +from hls4ml.model.flow import get_flow, register_flow + + +class OneAPIAcceleratorBackend(OneAPIBackend): + """ + This is the backend to run oneAPI code on an accelerator using the oneAPI framework. + """ + + def __init__(self): + super().__init__(name='OneAPIAccelerator') + + def _register_flows(self): + writer_passes = ['make_stamp', 'oneapiaccelerator:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=['oneapi:ip'], backend=self.name) + + ip_flow_requirements = get_flow('oneapi:ip').requires.copy() + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) diff --git a/hls4ml/templates/oneapi/firmware/defines.h b/hls4ml/templates/oneapi/firmware/defines.h index b2fc5bdd9a..05de507dcd 100644 --- a/hls4ml/templates/oneapi/firmware/defines.h +++ b/hls4ml/templates/oneapi/firmware/defines.h @@ -2,7 +2,6 @@ #define DEFINES_H_ #include -#include #include #include #include diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h new file mode 100644 index 0000000000..58e9f5e240 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h @@ -0,0 +1,142 @@ +#ifndef NNET_DATA_MOVEMENT_H +#define NNET_DATA_MOVEMENT_H + +#include +#include + +// This file defines the methods to transfer the data to the kernel. In the HLS flow, +// these are really part of the testbench. However, in the accelerator (BSP) flow, they are +// actual kernels that are deployed in hardware. + +namespace nnet { + +////////////////////////////////////////////////////////////////////////////// +// These are the simple, testbench-only versions for the HLS flow +////////////////////////////////////////////////////////////////////////////// +template void convert_data(sycl::queue &q, srcType *src) { + constexpr auto dstTypeSize = std::tuple_size::value_type>{}; + for (size_t i = 0; i < SIZE / dstTypeSize; i++) { + typename ExtractPipeType::value_type ctype; + for (size_t j = 0; j < dstTypeSize; j++) { + ctype[j] = src[i * dstTypeSize + j]; + } + dest_pipe::write(q, ctype); + } +} + +template void convert_data_back(sycl::queue &q, dstType *dst) { + constexpr auto srcTypeSize = std::tuple_size::value_type>{}; + for (size_t i = 0; i < SIZE / srcTypeSize; i++) { + auto ctype = src_pipe::read(q); + for (size_t j = 0; j < srcTypeSize; j++) { + dst[i * srcTypeSize + j] = ctype[j].to_double(); + } + } +} + +////////////////////////////////////////////////////////////////////////////// +// The ones below can be used both in testbenches and in the accelerator flow +////////////////////////////////////////////////////////////////////////////// +#if !defined(IS_BSP) +// Definition for buffer locations for Avalon MM host. +inline constexpr unsigned kInputBufferLocation = 0; +inline constexpr unsigned kOutputBufferLocation = 1; +#endif + +// Implementation of a direct memory access kernel. Move data from source, convert, +// and send to the sink. Adaptive to SYCL HLS and hardware acceleration flow. +template struct DMA_convert_data { +#if !defined(IS_BSP) + // When targeting a device family, we instantiate an Avalon Memory Mapped Host for + // data transaction between host and the DMA kernel during emulation and simulation. + sycl::ext::oneapi::experimental::annotated_arg< + src_T *, + decltype(sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>, + sycl::ext::intel::experimental::buffer_location, + sycl::ext::intel::experimental::read_write_mode_read, sycl::ext::intel::experimental::wait_request_requested})> +#else + // When targeting oneAPI BSP, we can use USM pointer to access host memory. + src_T *const +#endif + src; + size_t num_iteration; + + [[intel::kernel_args_restrict]] void operator()() const { + +#if defined(IS_BSP) + // Access data using host pointer. + sycl::ext::intel::host_ptr src_ptr(src); +#else + // Host allocation is not supported when targeting an FPGA family or part number. + src_T *src_ptr(src); +#endif + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using DstDataType = typename nnet::ExtractDataType::value_type; + constexpr auto dstTypeSize = std::tuple_size{}; + + [[intel::fpga_register]] typename nnet::ExtractPipeType::value_type packet; + + // Keep sending data to the input layer and keep the kernels running. + for (size_t i = 0; i < num_iteration; i++) { + #pragma unroll + for (size_t j = 0; j < dstTypeSize; j++) { + packet.data[j] = src_ptr[i * dstTypeSize + j]; + } + packet.sop = (i == 0); + // Assert end-of-packet signal after the last iteration. + // All down-stream kernels will stop seeing eop. + packet.eop = (i == (num_iteration - 1)); + dest_pipe::write(packet); + } + } +}; + +// Symmetrical to the DMA_convert_data above, this DMA drains the output pipe and +// writes result to memory. +template struct DMA_convert_data_back { +#if !defined(IS_BSP) + // Without BSP, instantiate an Avalon Memory Mapped Host to write to host. + sycl::ext::oneapi::experimental::annotated_arg< + dst_T *, + decltype(sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>, + sycl::ext::intel::experimental::buffer_location, + sycl::ext::intel::experimental::read_write_mode_write, sycl::ext::intel::experimental::wait_request_requested})> +#else + // USM pointer, otherwise. + dst_T *const +#endif + dst; + size_t num_iteration; + + [[intel::kernel_args_restrict]] void operator()() const { +#if defined(IS_BSP) + sycl::ext::intel::host_ptr dst_ptr(dst); +#else + dst_T *dst_ptr(dst); +#endif + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto srcTypeSize = std::tuple_size{}; + + [[intel::fpga_register]] typename nnet::ExtractPipeType::value_type packet; + + // Drain the output pipe and write result to memory. + for (size_t i = 0; i < num_iteration; i++) { + packet = src_pipe::read(); + #pragma unroll + for (size_t j = 0; j < srcTypeSize; j++) { + dst_ptr[i * srcTypeSize + j] = static_cast(packet.data[j].to_double()); + } + } + } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h index c7af2e7a68..e5b451655a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h @@ -12,27 +12,6 @@ namespace nnet { -template void convert_data(sycl::queue &q, srcType *src) { - constexpr auto dstTypeSize = std::tuple_size::value_type>{}; - for (size_t i = 0; i < SIZE / dstTypeSize; i++) { - typename ExtractPipeType::value_type ctype; - for (size_t j = 0; j < dstTypeSize; j++) { - ctype[j] = src[i * dstTypeSize + j]; - } - dest_pipe::write(q, ctype); - } -} - -template void convert_data_back(sycl::queue &q, dstType *dst) { - constexpr auto srcTypeSize = std::tuple_size::value_type>{}; - for (size_t i = 0; i < SIZE / srcTypeSize; i++) { - auto ctype = src_pipe::read(q); - for (size_t j = 0; j < srcTypeSize; j++) { - dst[i * srcTypeSize + j] = ctype[j].to_double(); - } - } -} - extern bool trace_enabled; extern std::map *trace_outputs; extern size_t trace_type_size; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h index 8cf883c1d5..a35bba17bf 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -8,6 +8,8 @@ #include #include +#include // Streaming Beat and pipe properties. + namespace nnet { // Define the pipe type that we use @@ -34,6 +36,15 @@ struct ExtractPipeType struct ExtractDataType { typedef T value_type; }; + +// Specialization on oneAPI StreamingBeat type. +template +struct ExtractDataType> { + typedef DataT value_type; +}; + /* * HLS Shift Register Implementation * To verify a shift register is used in hardware, go to report.html > Area Analysis of System diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp index ddad1d054b..fa73db7c2a 100644 --- a/hls4ml/templates/oneapi/myproject_bridge.cpp +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -2,7 +2,7 @@ #define MYPROJECT_BRIDGE_H_ #include "firmware/myproject.h" -#include "firmware/nnet_utils/nnet_helpers.h" +#include "firmware/nnet_utils/nnet_data_movement.h" #include #include diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index 82fb60d2f8..dc570dcb07 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -7,6 +7,7 @@ #include #include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_data_movement.h" #include "firmware/parameters.h" #include From ad6adfa433a4939bb850ac53100ef56406a897e6 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 7 Aug 2025 16:46:27 -0500 Subject: [PATCH 02/14] snapshot adding files --- hls4ml/templates/oneapi/CMakeLists.txt | 6 +- hls4ml/templates/oneapi/firmware/myproject.h | 2 +- .../oneapi_accelerator/firmware/myproject.h | 29 ++ .../oneapi_accelerator/myproject_test.cpp | 199 +++++++++++ hls4ml/writer/oneapi_accelerator_writer.py | 318 ++++++++++++++++++ 5 files changed, 552 insertions(+), 2 deletions(-) create mode 100644 hls4ml/templates/oneapi_accelerator/firmware/myproject.h create mode 100644 hls4ml/templates/oneapi_accelerator/myproject_test.cpp create mode 100644 hls4ml/writer/oneapi_accelerator_writer.py diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index 5bce2aaf84..0837a2976b 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -39,14 +39,18 @@ set(LIBRARY_NAME myproject-${LIB_STAMP}) # specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP. if(NOT DEFINED FPGA_DEVICE) set(FPGA_DEVICE "Agilex7") + set(BSP_FLAG "") endif() +# Set the target to a BSP if we target an actual accelerator board. +# hls-fpga-machine-learning insert oneapi_bsp_cmake_flag + # Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend # compilation. set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS}) # Use cmake -DUSER_FLAGS= to set extra flags for general compilation. -set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS}) +set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS} ${BSP_FLAG}) # Use cmake -DUSER_INCLUDE_PATHS= to set extra paths for general # compilation. diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index 082ae5dc8c..ec3bf146f5 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -5,7 +5,7 @@ // This file defines the interface to the kernel -// currently this is fixed +// this is for both the internal pipes and the interface for the HLS (ip) flow using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); // Need to declare the input and output pipes diff --git a/hls4ml/templates/oneapi_accelerator/firmware/myproject.h b/hls4ml/templates/oneapi_accelerator/firmware/myproject.h new file mode 100644 index 0000000000..ef0d458a4d --- /dev/null +++ b/hls4ml/templates/oneapi_accelerator/firmware/myproject.h @@ -0,0 +1,29 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include "defines.h" + +// This file defines the interface to the kernel + +// currently this is for the internal pipes, not the interface, in accelerator flow +using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); + +// Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA. +// They are connected to the first and the last layer to stream data into and out from the kernel. +using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties( + sycl::ext::intel::experimental::ready_latency<0>, sycl::ext::intel::experimental::bits_per_symbol<16>, + sycl::ext::intel::experimental::uses_valid, sycl::ext::intel::experimental::first_symbol_in_high_order_bits, + sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready)); + +// Need to declare the input and output pipes + +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs + +class MyProjectID; + +struct MyProject { + SYCL_EXTERNAL void operator()() const; +}; + +#endif diff --git a/hls4ml/templates/oneapi_accelerator/myproject_test.cpp b/hls4ml/templates/oneapi_accelerator/myproject_test.cpp new file mode 100644 index 0000000000..f90129db4c --- /dev/null +++ b/hls4ml/templates/oneapi_accelerator/myproject_test.cpp @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_data_movement.h" +#include "firmware/parameters.h" + +#include + +#if (__INTEL_CLANG_COMPILER < 20250000) +#include +#endif + +#include "exception_handler.hpp" +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +// Functions that reads input and prediction data from files. +// Returns `true` if files are read successfully and not empty. +// Returns `false` otherwise. +bool prepare_data_from_file(std::string &fin_path, std::string &fpr_path, std::vector> &inputs, + std::vector> &predictions) { + // load input data from text file + std::ifstream fin(fin_path.c_str()); + // load predictions from text file + std::ifstream fpr(fpr_path.c_str()); + + std::string iline; + std::string pline; + + if (fin.is_open() && fpr.is_open()) { + size_t num_iterations = 0; + + // Prepare input data from file. Load predictions from file. + for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::stringstream ssin(iline); + while (ssin >> current) { + in.push_back(current); + } + + std::stringstream sspred(pline); + while (sspred >> current) { + pr.push_back(current); + } + + std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); + std::copy(in.cbegin(), in.cend(), inputs.back().begin()); + } + fin.close(); + fpr.close(); + if (inputs.empty()) + return false; + else + return true; + } else { + return false; + } +} + +int main(int argc, char **argv) { + +#if FPGA_SIMULATOR +#define NUM_ITERATIONS 5 + auto selector = sycl::ext::intel::fpga_simulator_selector_v; +#elif FPGA_HARDWARE +#define NUM_ITERATIONS 100 + auto selector = sycl::ext::intel::fpga_selector_v; +#else // #if FPGA_EMULATOR +#define NUM_ITERATIONS 10 + auto selector = sycl::ext::intel::fpga_emulator_selector_v; +#endif + + sycl::queue q(selector, fpga_tools::exception_handler, sycl::property::queue::enable_profiling{}); + + auto device = q.get_device(); + + // make sure the device supports USM host allocations + if (!device.has(sycl::aspect::usm_host_allocations)) { + std::cerr << "This design must either target a board that supports USM " + "Host/Shared allocations, or IP Component Authoring. " + << std::endl; + std::terminate(); + } + + std::cout << "Running on device: " << device.get_info().c_str() << std::endl; + + std::string INPUT_FILE = "tb_data/tb_input_features.dat"; + std::string PRED_FILE = "tb_data/tb_output_predictions.dat"; + std::string RESULTS_LOG = "tb_data/results.log"; + std::ofstream fout(RESULTS_LOG); + + // Allocate vectors on stack to hold data from files temporarily. + std::vector> inputs; + std::vector> predictions; + bool file_valid = prepare_data_from_file(INPUT_FILE, PRED_FILE, inputs, predictions); + unsigned int num_iterations; + if (file_valid) { + num_iterations = inputs.size(); + } else { + num_iterations = NUM_ITERATIONS; + } + + // hls-fpga-machine-learning insert runtime contant + + try { + // Allocate host memory if BSP is in use. + float *vals = sycl::malloc_host(kInputSz, q); + if (vals == nullptr) { + std::cerr << "ERROR: host allocation failed for input\n"; + fout.close(); + return 1; + } + float *outputs = sycl::malloc_host(kOutputSz, q); + if (outputs == nullptr) { + std::cerr << "ERROR: host allocation failed for output\n"; + fout.close(); + return 1; + } + + if (file_valid) { + // Start always-run streaming kernel here, instead of inside a loop. + q.single_task(MyProject{}); + + // hls-fpga-machine-learning insert data + + // hls-fpga-machine-learning convert output + + // Print output from kernel and from prediction file. + for (int i = 0; i < num_iterations; i++) { + for (int j = 0; j < kOutLayerSize; j++) { + fout << outputs[i * kOutLayerSize + j] << " "; + } + fout << std::endl; + if (i % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + for (auto predval : predictions[i]) { + std::cout << predval << " "; + } + std::cout << std::endl; + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + for (int j = 0; j < kOutLayerSize; j++) { + std::cout << outputs[i * kOutLayerSize + j] << " "; + } + std::cout << std::endl; + } + } + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; + q.single_task(MyProject{}); + // hls-fpga-machine-learning insert top-level-function + // hls-fpga-machine-learning insert zero + // hls-fpga-machine-learning convert output + for (int i = 0; i < num_iterations; i++) { + for (int j = 0; j < kOutLayerSize; j++) { + std::cout << outputs[i * kOutLayerSize + j] << " "; + fout << outputs[i * kOutLayerSize + j] << " "; + } + std::cout << std::endl; + fout << std::endl; + } + } + sycl::free(vals, q); + sycl::free(outputs, q); + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + } catch (sycl::exception const &e) { + // Catches exceptions in the host code. + std::cerr << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.code().value() == CL_DEVICE_NOT_FOUND) { + std::cerr << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cerr << "Run sys_check in the oneAPI root directory to verify.\n"; + std::cerr << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + return 0; +} diff --git a/hls4ml/writer/oneapi_accelerator_writer.py b/hls4ml/writer/oneapi_accelerator_writer.py new file mode 100644 index 0000000000..ee836cc516 --- /dev/null +++ b/hls4ml/writer/oneapi_accelerator_writer.py @@ -0,0 +1,318 @@ +import os +from shutil import copyfile + +from hls4ml.utils.string_utils import convert_to_pascal_case +from hls4ml.writer.oneapi_writer import OneAPIWriter + +config_filename = 'hls4ml_config.yml' + + +class OneAPIAcceleratorWriter(OneAPIWriter): + + def write_project_cpp(self, model): + """Write the main architecture source file (myproject.cpp) + + Args: + model (ModelGraph): the hls4ml model. + """ + project_name = model.config.get_project_name() + + filedir = os.path.dirname(os.path.abspath(__file__)) + with ( + open(os.path.join(filedir, '../templates/oneapi/firmware/myproject.cpp')) as f, + open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.cpp', 'w') as fout, + ): + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + if len(model_brams) != 0: + raise NotImplementedError("Weights on the interface is currently not supported") + + io_type = model.config.get_config_value('IOType') + indent = ' ' + + for line in f.readlines(): + # Add headers to weights and biases + if 'myproject' in line: + newline = line.replace('myproject', project_name) + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + # oneAPI pipes need to be declared and passed as template parameters + elif '// hls-fpga-machine-learning insert inter-task pipes' in line: + newline = line + if io_type == 'io_stream': + for layer in model.get_layers(): + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + newline += var.declare_cpp() + + # Read in inputs + elif '// hls-fpga-machine-learning read in' in line: + newline = line + if io_type == 'io_parallel': + restartable_kernel_loop = f"bool keep_going = true;\n\n" f"{indent}while (keep_going) {{\n" + newline += indent + restartable_kernel_loop + for inp in model_inputs: + newline += indent * 2 + f'auto {inp.name}_beat = {inp.pipe_name}::read();\n' + newline += indent * 2 + f'auto {inp.name} = {inp.name}_beat.data;\n' + # for streaming we don't need to read it in + + # Insert weights + elif '// hls-fpga-machine-learning insert weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w not in model_brams: + newline += f'#include "weights/{w.name}.h"\n' + + # Insert task sequences + elif '// hls-fpga-machine-learning declare task sequences' in line: + if io_type == 'io_stream': # only need this for io_stream + newline = line + for layer in model.get_layers(): + ts = layer.get_attr('tast_sequence_cpp') + if ts: + newline += ' ' + ts + '\n' + else: + newline = indent + line + + # Neural net instantiation + elif '// hls-fpga-machine-learning insert layers' in line: + if io_type == 'io_parallel': + newline = indent + line + '\n' + else: + newline = line + '\n' + for layer in model.get_layers(): + if io_type != 'io_stream': + vars = layer.get_variables() + for var in vars: + if var not in model_inputs: + def_cpp = var.definition_cpp() + if def_cpp is not None: + newline += indent * 2 + def_cpp + ';\n' + func = ( + layer.get_attr('function_cpp') + if io_type == 'io_parallel' + else layer.get_attr('stream_function_cpp') + ) + if func: + newline += (indent * 2 if io_type == 'io_parallel' else indent) + func + '\n' + if model.config.trace_output and layer.get_attr('trace', False): + newline += '#ifndef HLS_SYNTHESIS\n' + for var in vars: + newline += ' nnet::save_layer_output<{}>({}, "{}", {});\n'.format( + var.type.name, var.name, layer.name, var.size_cpp() + ) + newline += '#endif\n' + + # Write the output + elif '// hls-fpga-machine-learning return' in line: + newline = line + if io_type == 'io_parallel': + newline = indent + newline + for out in model_outputs: + out_beat = f"{out.name}_beat" + newline += ( + indent * 2 + f'typename nnet::ExtractPipeType<{out.pipe_name}>::value_type {out_beat};\n' + ) + newline += indent * 2 + f'{out_beat}.data = {out.name};\n' + newline += indent * 2 + f'{out.pipe_name}::write({out_beat});\n' + newline += indent * 2 + '// stops the kernel when the last input seen.\n' + newline += indent * 2 + f'keep_going = !{model_inputs[0].name}_beat.eop;\n' + newline += f"{indent}}}\n" + # don't need to add anything in io_stream + + # Just copy line + else: + newline = line + + fout.write(newline) + + def write_project_header(self, model): + """Write the main architecture header file (myproject.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + + project_name = model.config.get_project_name() + + filedir = os.path.dirname(os.path.abspath(__file__)) + with ( + open(os.path.join(filedir, '../templates/oneapi_accelerator/firmware/myproject.h')) as f, + open(f'{model.config.get_output_dir()}/src/firmware/{project_name}.h', 'w') as fout, + ): + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + # model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + # io_parallel and io_stream instantiate the top-level function differently (io_stream not yet supported) + # io_type = model.config.get_config_value('IOType') + # indent = ' ' + # brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(project_name.upper())) + + elif 'myproject' in line: + newline = line.replace('myproject', project_name) + + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + # Declarations for the inputs. May need modification when io_stream is supported + elif '// hls-fpga-machine-learning insert inputs' in line: + newline = line + for inp in model_inputs: + newline += inp.declare_cpp() + + # and declareations for the outputs + elif '// hls-fpga-machine-learning insert outputs' in line: + newline = line + for out in model_outputs: + newline += out.declare_cpp() + + # Simply copy line, if no inserts are required + else: + newline = line + + fout.write(newline) + + def write_test_bench(self, model): + """Write the testbench + + Args: + model (ModelGraph): the hls4ml model. + """ + # TODO - This function only works with one model input + # (NOT one data point - it works as expected with multiple data points) + + # copy the exception handler + filedir = os.path.dirname(os.path.abspath(__file__)) + srcpath = os.path.join(filedir, '../templates/oneapi/exception_handler.hpp') + dstpath = f'{model.config.get_output_dir()}/src/exception_handler.hpp' + copyfile(srcpath, dstpath) + + project_name = model.config.get_project_name() + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + if len(model_brams) != 0: + raise NotImplementedError("Weights on the interface is currently not supported") + + if len(model_inputs) != 1 or len(model_outputs) != 1: + print("The testbench supports only single input arrays and single output arrays.") + print("Please modify it before using it.") + + if not os.path.exists(f'{model.config.get_output_dir()}/tb_data/'): + os.mkdir(f'{model.config.get_output_dir()}/tb_data/') + + input_data = model.config.get_config_value('InputData') + output_predictions = model.config.get_config_value('OutputPredictions') + + if input_data: + if input_data[-3:] == "dat": + copyfile(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + else: + self.__make_dat_file(input_data, f'{model.config.get_output_dir()}/tb_data/tb_input_features.dat') + + if output_predictions: + if output_predictions[-3:] == "dat": + copyfile(output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat') + else: + self.__make_dat_file( + output_predictions, f'{model.config.get_output_dir()}/tb_data/tb_output_predictions.dat' + ) + + with ( + open(os.path.join(filedir, '../templates/oneapi_accelerator/myproject_test.cpp')) as f, + open(f'{model.config.get_output_dir()}/src/{project_name}_test.cpp', 'w') as fout, + ): + for line in f.readlines(): + indent = ' ' * (len(line) - len(line.lstrip(' '))) + + if 'myproject' in line: + newline = line.replace('myproject', project_name) + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert runtime contant' in line: + newline = line + insert_constant_lines = ( + f'{indent}const size_t kInputSz = {model_inputs[0].size_cpp()} * num_iterations;\n' + f'{indent}const size_t kOutputSz = {model_outputs[0].size_cpp()} * num_iterations;\n' + f'{indent}const size_t kInputLayerSize = {model_inputs[0].size_cpp()};\n' + f'{indent}const size_t kOutLayerSize = {model_outputs[0].size_cpp()};\n' + ) + newline += insert_constant_lines + elif '// hls-fpga-machine-learning insert zero' in line: + newline = line + inp = model_inputs[0] + insert_zero_lines = ( + f'{indent}for (int j = 0 ; j < kInputSz; j++)\n' + f'{indent} vals[j] = 0.0;\n' + f'{indent}q.single_task(nnet::DMA_convert_data{{vals, num_iterations}});\n' + ) + newline += insert_zero_lines + elif '// hls-fpga-machine-learning insert data' in line: + newline = line + inp = model_inputs[0] + insert_data_lines = ( + f'{indent}for (int i = 0; i < num_iterations; i++)\n' + f'{indent} for (int j = 0 ; j < kInputLayerSize; j++)\n' + f'{indent} vals[i * kInputLayerSize + j] = inputs[i][j]; \n' + f'{indent}q.single_task(nnet::DMA_convert_data{{vals, num_iterations}});\n' + ) + newline += insert_data_lines + elif '// hls-fpga-machine-learning convert output' in line: + newline = line + out = model_outputs[0] + newline += f'{indent}q.single_task(nnet::DMA_convert_data_back<{out.pipe_name}, float>' + newline += '{outputs, num_iterations}).wait();\n' + else: + newline = line + + fout.write(newline) + + def write_build_script(self, model): + """Write the build scripts (Makefile, build_lib.sh) + + Args: + model (ModelGraph): the hls4ml model. + """ + + # Makefile + filedir = os.path.dirname(os.path.abspath(__file__)) + device = model.config.get_config_value('Part') + period = model.config.get_config_value('ClockPeriod') + hyper = model.config.get_config_value('HyperoptHandshake') + with ( + open(os.path.join(filedir, '../templates/oneapi/CMakeLists.txt')) as f, + open(f'{model.config.get_output_dir()}/CMakeLists.txt', 'w') as fout, + ): + for line in f.readlines(): + line = line.replace('myproject', model.config.get_project_name()) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + if 'set(FPGA_DEVICE' in line: + line = f' set(FPGA_DEVICE "{device}")\n' + + if model.config.get_config_value('UseOneAPIBSP'): + if 'hls-fpga-machine-learning insert oneapi_bsp_cmake_flag' in line: + line = 'set(BSP_FLAG "-DIS_BSP")' + + if 'set(USER_FPGA_FLAGS' in line: + line += f'set(USER_FPGA_FLAGS -Xsclock={period}ns; ${{USER_FPGA_FLAGS}})\n' + if not hyper: + line += 'set(USER_FPGA_FLAGS -Xsoptimize=latency; ${USER_FPGA_FLAGS})\n' + + fout.write(line) From 0ae77bbe7df67d9d41f2cee38941d2dcd62e00ab Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 8 Aug 2025 09:41:07 -0500 Subject: [PATCH 03/14] some steps towards oneapi accelerator --- .../oneapi_accelerator_backend.py | 20 +++++++ .../oneapi_accelerator_types.py | 33 ++++++++++ .../oneapi_accelerator/passes/__init__.py | 0 .../passes/transform_types.py | 60 +++++++++++++++++++ 4 files changed, 113 insertions(+) create mode 100644 hls4ml/backends/oneapi_accelerator/oneapi_accelerator_types.py create mode 100644 hls4ml/backends/oneapi_accelerator/passes/__init__.py create mode 100644 hls4ml/backends/oneapi_accelerator/passes/transform_types.py diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py index 52438de1b5..768cf3876f 100644 --- a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py @@ -16,3 +16,23 @@ def _register_flows(self): ip_flow_requirements = get_flow('oneapi:ip').requires.copy() self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def create_initial_config( + self, part, clock_period=5, hyperopt_handshake=False, io_type='io_parallel', write_tar=False, **_ + ): + """Create initial configuration of the oneAPI backend. + + Args: + part (str): The path to the board support file to be used. + clock_period (int, optional): The clock period in ns. Defaults to 5. + hyperopt_handshake (bool, optional): Should hyper-optimized handshaking be used? Defaults to False + io_type (str, optional): Type of implementation used. One of + 'io_parallel' or 'io_stream'. Defaults to 'io_parallel'. + write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to False. + + Returns: + dict: initial configuration. + """ + config = super().create_initial_config(part, clock_period, hyperopt_handshake, io_type, write_tar, **_) + config['UseOneAPIBSP'] = True + return config diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_types.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_types.py new file mode 100644 index 0000000000..b96fe93c5c --- /dev/null +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_types.py @@ -0,0 +1,33 @@ +from hls4ml.backends.fpga.fpga_types import VariableDefinition +from hls4ml.backends.oneapi.oneapi_types import AggregratedArrayVariableConverter + + +# region InterfaceMemberVariable +class OneAPIAcceleratorInterfaceVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if self.pragma and not isinstance(self.pragma, tuple): + return f'[[{self.pragma}]] {self.type.name} {self.name}{name_suffix}' + else: + return f'{self.type.name} {self.name}{name_suffix}' + + # Updated pipe min size to be 32 for simulation. + def declare_cpp(self, pipe_min_size=32, indent=''): + # Updated to use streaming beat for restartable streaming kernel. + # Streaming beat is a wrapper type of the actual type with sideband control signals. + # Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat; + streaming_beat_t = f"{self.pipe_name}BeatT" + lines = ( + f"{indent}class {self.pipe_id};\n" + f"{indent}using {streaming_beat_t} = " + f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n" + f"{indent}using {self.pipe_name} = sycl::ext::intel::experimental::pipe<" + f"{self.pipe_id}, {streaming_beat_t}, {pipe_min_size}, HostPipePropertiesT>;\n" + ) + return lines + + +class OneAPIAcceleratorInterfaceVariableConverter(AggregratedArrayVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIAcceleratorInterfaceVariableDefinition + ) diff --git a/hls4ml/backends/oneapi_accelerator/passes/__init__.py b/hls4ml/backends/oneapi_accelerator/passes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/oneapi_accelerator/passes/transform_types.py b/hls4ml/backends/oneapi_accelerator/passes/transform_types.py new file mode 100644 index 0000000000..87da9486e4 --- /dev/null +++ b/hls4ml/backends/oneapi_accelerator/passes/transform_types.py @@ -0,0 +1,60 @@ +from hls4ml.backends.oneapi.oneapi_types import ( + OneAPIACTypeConverter, + OneAPIArrayVariableConverter, + OneAPIHLSTypeConverter, + OneAPIInplaceArrayVariableConverter, + OneAPIInplaceStreamVariableConverter, + OneAPIStaticWeightVariableConverter, + OneAPIStreamVariableConverter, +) +from hls4ml.backends.oneapi_accelerator.oneapi_accelerator_types import ( + OneAPIAcceleratorInterfaceVariableConverter, +) +from hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + + +class TransformTypes(GlobalOptimizerPass): + def __init__(self): + self.type_converter = OneAPIHLSTypeConverter(precision_converter=OneAPIACTypeConverter()) + self.array_var_converter = OneAPIArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = OneAPIInplaceArrayVariableConverter(type_converter=self.type_converter) + self.interface_var_converter = OneAPIAcceleratorInterfaceVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = OneAPIStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = OneAPIInplaceStreamVariableConverter(type_converter=self.type_converter) + self.weight_var_converter = OneAPIStaticWeightVariableConverter(type_converter=self.type_converter) + + def transform(self, model, node): + io_type = node.model.config.get_config_value('IOType') + + for out_name, var in node.variables.items(): + if io_type == 'io_stream': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='stream') + elif isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var, pragma='stream') + else: + new_var = self.stream_var_converter.convert(var, pragma='stream') + elif io_type == 'io_parallel': + if out_name in node.model.inputs: + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') + elif out_name in node.model.outputs: + new_var = self.interface_var_converter.convert(var, pragma='intel::fpga_register') + elif isinstance(var, InplaceTensorVariable): + new_var = self.inplace_array_var_converter.convert(var, pragma='') + else: + new_var = self.array_var_converter.convert(var, pragma='intel::fpga_register') + else: + raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') + + node.set_attr(out_name, new_var) + + for w_name, weight in node.weights.items(): + new_weight = self.weight_var_converter.convert(weight) + node.set_attr(w_name, new_weight) + + for t_name, type in node.types.items(): + new_type = self.type_converter.convert(type) + node.set_attr(t_name, new_type) From f5fcc0af5fec8ad79e92d0d0460c9e54a85e0516 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 15 Aug 2025 15:55:43 -0500 Subject: [PATCH 04/14] update some of the oneapi accelerator backend setup --- hls4ml/backends/__init__.py | 2 ++ .../oneapi_accelerator_backend.py | 16 +++++++++++++--- .../oneapi_accelerator_types.py | 4 +++- hls4ml/writer/__init__.py | 2 ++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 4a48f072cd..2e95491485 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -1,6 +1,7 @@ from hls4ml.backends.backend import Backend, get_available_backends, get_backend, register_backend # noqa: F401 from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend +from hls4ml.backends.oneapi_accelerator.oneapi_accelerator_backend import OneAPIAcceleratorBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend from hls4ml.backends.vivado.vivado_backend import VivadoBackend @@ -18,3 +19,4 @@ register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) register_backend('oneAPI', OneAPIBackend) +register_backend('oneAPIAccelerator', OneAPIAcceleratorBackend) # Can only be registered after oneAPI diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py index 768cf3876f..1ac13c8e27 100644 --- a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py @@ -8,13 +8,23 @@ class OneAPIAcceleratorBackend(OneAPIBackend): """ def __init__(self): - super().__init__(name='OneAPIAccelerator') + super().__init__(name='oneAPIAccelerator') def _register_flows(self): writer_passes = ['make_stamp', 'oneapiaccelerator:write_hls'] self._writer_flow = register_flow('write', writer_passes, requires=['oneapi:ip'], backend=self.name) - ip_flow_requirements = get_flow('oneapi:ip').requires.copy() + oneapi_types = [ + 'oneapiaccelerator:transform_types', + 'oneapi:register_bram_weights', + 'oneapi:apply_resource_strategy', + 'oneapi:apply_winograd_kernel_transformation', + ] + oneapi_types_flow = register_flow('specific_types', oneapi_types, requires=['oneapi:init_layers'], backend=self.name) + + ip_flow_requirements = [ + oneapi_types_flow if opt == 'oneapi:specific_types' else opt for opt in get_flow('oneapi:ip').requires + ] self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) def create_initial_config( @@ -23,7 +33,7 @@ def create_initial_config( """Create initial configuration of the oneAPI backend. Args: - part (str): The path to the board support file to be used. + part (str): The path to the board support package to be used. Can add : clock_period (int, optional): The clock period in ns. Defaults to 5. hyperopt_handshake (bool, optional): Should hyper-optimized handshaking be used? Defaults to False io_type (str, optional): Type of implementation used. One of diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_types.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_types.py index b96fe93c5c..27ce510f0c 100644 --- a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_types.py +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_types.py @@ -29,5 +29,7 @@ def declare_cpp(self, pipe_min_size=32, indent=''): class OneAPIAcceleratorInterfaceVariableConverter(AggregratedArrayVariableConverter): def __init__(self, type_converter): super().__init__( - type_converter=type_converter, prefix='OneAPI', definition_cls=OneAPIAcceleratorInterfaceVariableDefinition + type_converter=type_converter, + prefix='OneAPIAccelerator', + definition_cls=OneAPIAcceleratorInterfaceVariableDefinition, ) diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 8de19fe1d2..b8b066f036 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,4 +1,5 @@ from hls4ml.writer.catapult_writer import CatapultWriter +from hls4ml.writer.oneapi_accelerator_writer import OneAPIAcceleratorWriter from hls4ml.writer.oneapi_writer import OneAPIWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter @@ -12,5 +13,6 @@ register_writer('Vitis', VitisWriter) register_writer('Quartus', QuartusWriter) register_writer('oneAPI', OneAPIWriter) +register_writer('oneAPIAccelerator', OneAPIAcceleratorWriter) register_writer('Catapult', CatapultWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) From 944e6300b2592b67b20b19ffa7cb27f43f97b9a9 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 15 Aug 2025 16:15:52 -0500 Subject: [PATCH 05/14] try using DMA also for bridge --- hls4ml/writer/oneapi_accelerator_writer.py | 85 ++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/hls4ml/writer/oneapi_accelerator_writer.py b/hls4ml/writer/oneapi_accelerator_writer.py index ee836cc516..dd6ca6d065 100644 --- a/hls4ml/writer/oneapi_accelerator_writer.py +++ b/hls4ml/writer/oneapi_accelerator_writer.py @@ -283,6 +283,91 @@ def write_test_bench(self, model): fout.write(newline) + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) + + Args: + model (ModelGraph): the hls4ml model. + """ + project_name = model.config.get_project_name() + stamp = model.config.get_config_value('Stamp') + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + # model brambs aren't actually supported yet + + # io_type = model.config.get_config_value('IOType') + indent = ' ' + + filedir = os.path.dirname(os.path.abspath(__file__)) + with ( + open(os.path.join(filedir, '../templates/oneapi/myproject_bridge.cpp')) as f, + open(f'{model.config.get_output_dir()}/src/{project_name}_bridge.cpp', 'w') as fout, + ): + for line in f.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(project_name.upper())) + + elif 'myproject' in line: + newline = line.replace('myproject', format(project_name)) + + elif 'MyProject' in line: + newline = line.replace('MyProject', convert_to_pascal_case(project_name)) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert class def' in line: + dtype = line.split('#', 1)[1].strip() + newline = f'class {convert_to_pascal_case(project_name)}Class{dtype.capitalize()}_{stamp};\n' + + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + '\n' + + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + newline += indent + f'nnet::DMA_convert_data<{dtype}, {i.pipe_name}, {i.size_cpp()}>(q, {i.name});\n' + + newline += ( + indent + + f'q.single_task<{convert_to_pascal_case(project_name)}Class{dtype.capitalize()}_{stamp}>' + + f'({convert_to_pascal_case(project_name)}{{}});\n' + ) + + for o in model_outputs: + newline += ( + indent + f'nnet::DMA_convert_data_back<{o.pipe_name}, {dtype}, {o.size_cpp()}>(q, {o.name});\n' + ) + newline += '\n' + newline += indent + 'q.wait();\n' + + elif '// hls-fpga-machine-learning insert trace_outputs' in line: + newline = '' + for layer in model.get_layers(): + func = layer.get_attr('function_cpp') + if func and model.config.trace_output and layer.get_attr('trace', False): + vars = layer.get_variables() + for var in vars: + newline += ( + indent + + 'nnet::trace_outputs->insert(std::pair(' + + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' + ) + + else: + newline = line + fout.write(newline) + def write_build_script(self, model): """Write the build scripts (Makefile, build_lib.sh) From ca6c3156a26a41c5afc3c10ae327af56ff335a31 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Wed, 20 Aug 2025 18:08:06 -0500 Subject: [PATCH 06/14] setup predict for easier hardware acceleration --- .../firmware/nnet_utils/nnet_data_movement.h | 53 ++++++++++++++++++- hls4ml/writer/oneapi_accelerator_writer.py | 4 +- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h index 58e9f5e240..93162b6b7c 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h @@ -11,7 +11,7 @@ namespace nnet { ////////////////////////////////////////////////////////////////////////////// -// These are the simple, testbench-only versions for the HLS flow +// These are the simple, testbench and bridge versions for the HLS flow ////////////////////////////////////////////////////////////////////////////// template void convert_data(sycl::queue &q, srcType *src) { constexpr auto dstTypeSize = std::tuple_size::value_type>{}; @@ -137,6 +137,57 @@ template struct DMA_convert_data_back { } }; +////////////////////////////////////////////////////////////////////////////// +// These are versions to convert data for the accelerator bridge (using BSP) +////////////////////////////////////////////////////////////////////////////// +template void DMA_bridge_convert_data(sycl::queue &q, srcType *src) { + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using DstDataType = typename nnet::ExtractDataType::value_type; + constexpr auto dstTypeSize = std::tuple_size{}; + + constexpr size_t num_iterations = SIZE / dstTypeSize; + + // Allocate host memory + srcType *vals = sycl::malloc_host(SIZE, q); + if (vals == nullptr) { + std::cerr << "ERROR: host allocation failed for input\n"; + return; + } + // copy to host memory + for (size_t i = 0; i < SIZE; i++) { + vals[i] = src[i]; + } + q.single_task(DMA_convert_data{vals, num_iterations}); +} + +template void DMA_bridge_convert_data_back(sycl::queue &q, dstType *dst) { + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto srcTypeSize = std::tuple_size{}; + + constexpr size_t num_iterations = SIZE / srcTypeSize; + + // Allocate host memory + dstType *outputs = sycl::malloc_host(SIZE, q); + if (outputs == nullptr) { + std::cerr << "ERROR: host allocation failed for output\n"; + return; + } + + q.single_task(DMA_convert_data_back{outputs, num_iterations}).wait(); + + // copy the data back + for (size_t j = 0; j < SIZE; j++) { + dst[j] = outputs[j]; + } +} + + + } // namespace nnet #endif diff --git a/hls4ml/writer/oneapi_accelerator_writer.py b/hls4ml/writer/oneapi_accelerator_writer.py index dd6ca6d065..5c75b729c5 100644 --- a/hls4ml/writer/oneapi_accelerator_writer.py +++ b/hls4ml/writer/oneapi_accelerator_writer.py @@ -336,7 +336,7 @@ def write_bridge(self, model): dtype = line.split('#', 1)[1].strip() newline = '' for i in model_inputs: - newline += indent + f'nnet::DMA_convert_data<{dtype}, {i.pipe_name}, {i.size_cpp()}>(q, {i.name});\n' + newline += indent + f'nnet::DMA_bridge_convert_data<{dtype}, {i.pipe_name}, {i.size_cpp()}>(q, {i.name});\n' newline += ( indent @@ -346,7 +346,7 @@ def write_bridge(self, model): for o in model_outputs: newline += ( - indent + f'nnet::DMA_convert_data_back<{o.pipe_name}, {dtype}, {o.size_cpp()}>(q, {o.name});\n' + indent + f'nnet::DMA_bridge_convert_data_back<{o.pipe_name}, {dtype}, {o.size_cpp()}>(q, {o.name});\n' ) newline += '\n' newline += indent + 'q.wait();\n' From 49acece955138f0cb216a6d5bf6787760f26b9b6 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Thu, 21 Aug 2025 18:07:36 -0500 Subject: [PATCH 07/14] snapshot of trying to handle streaming in onaAPI accelerator --- .../oneapi_accelerator_layers.py | 24 +++++++ .../firmware/nnet_utils/nnet_dense_stream.h | 4 +- .../firmware/nnet_utils/nnet_stream_beat.h | 71 +++++++++++++++++++ 3 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py create mode 100644 hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py new file mode 100644 index 0000000000..da77c2bca0 --- /dev/null +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py @@ -0,0 +1,24 @@ +import numpy as np + +from hls4ml.model.attributes import Attribute, ConfigurableAttribute, TypeAttribute +from hls4ml.model.layers import Layer +from hls4ml.model.types import IntegerPrecisionType + + +class ExtractSideband(Layer): + '''This layer extract the sideband and sends it to a different strem + ''' + + SIDEBAND_SHAPE = 2 + + def initialize(self): + inp = self.get_input_variable() + + # I think the order of these must be as stated because they each set the result_t type. + # We want the second one to be the actual result_t. + self.add_output_variable(SIDEBAND_SHAPE, + out_name='sideband', + var_name='sideband_out', + type_name='sideband_t', + precision=IntegerPrecisionType(1, False)) + self.add_output_variable(inp.shape, precision=inp.precision) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 92c9adc3bb..f162b4736c 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -7,7 +7,9 @@ namespace nnet { -// Note: DataPack logic removed, at least in the initial version +// Note: DataPack logic removed, at least in the initial version. +// The data should be sent to the dense layer in parallel, in one stream transaction. +// Note that this means flatten is not a noop in oneAPI streaming. template void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h new file mode 100644 index 0000000000..5ebe98b87d --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h @@ -0,0 +1,71 @@ +#ifndef NNET_STREAM_BEAT_H +#define NNET_STREAM_BEAT_H + +// These are functions just for streaming in accelerator mode. They convert from using packets +// to not using packets, and visa versa. + +// ************************************************* +// Remove sideband and passes it to end via skip pipe +// ************************************************* +template [[intel::use_stall_enable_clusters]] void remove_sideband_stream() { + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool sop = false; + bool eop = false; + +LinearActLoop: + [[intel::initiation_interval(1)]] while (!eop) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { + auto in_data = data_pipe::read(); + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size::value_type>{}; j++) { + out_data[j] = in_data.data[j]; + } + + res_pipe::write(out_data); + + if (i == 0) { + sop = in_data.sop; + } + eop = in_data.eop; + } + typename ExtractPipeType::value_type skip_data; // this is a two-element array, {sop, eop}. + skip_data[0] = sop; + skip_data[1] = eop; + skip_pipe::write(skip_data); + } +} + +// ************************************************* +// Recieves sideband via skip pipe, and makees it sideband +// ************************************************* + +template [[intel::use_stall_enable_clusters]] void add_sideband_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + constexpr auto num_transfers = CONFIG_T::n_in / std::tuple_size{}; + + auto skip_data = skip_pipe::read(); + + +LinearActLoop: + [[intel::initiation_interval(1)]] for (int i = 0; i < num_transfers; i++) { + auto in_data = data_pipe::read(); + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + out_data.data[j] = in_data.data[j]; + } + out_data.sop = (i == 0) ? skip_data[0] : false; + out_data.eop = (i == num_transfers-1) ? skip_data[1] : false; + res_pipe::write(out_data); + } +} + + +#endif \ No newline at end of file From f2118b8a61df12d7438850e44c3fc1025456f907 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 1 Aug 2025 22:27:24 -0500 Subject: [PATCH 08/14] change some things that assume name == output[0], and reoder so that the names aren't changed in between --- hls4ml/model/graph.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index e3c293dd46..1e2943606b 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -693,6 +693,11 @@ def replace_node(self, old_node, new_node): repl = {old_name: new_name for old_name, new_name in zip(old_node.outputs, new_node.outputs)} repl.update({old_name: new_name for old_name, new_name in zip(old_node.inputs, new_node.inputs)}) + for old_output in old_node.outputs: + if old_output in self.outputs: + new_output = repl[old_output] + self.outputs = [new_output if name == old_output else name for name in self.outputs] + for node in self.graph.values(): for i, n in enumerate(node.inputs): if n in repl: @@ -703,10 +708,7 @@ def replace_node(self, old_node, new_node): self.graph = OrderedDict((new_node.name, new_node) if k == old_node.name else (k, v) for k, v in self.graph.items()) - old_name = old_node.name - if old_name in self.outputs: - new_name = new_node.name - self.outputs = [new_name if name == old_name else name for name in self.outputs] + def split_node(self, old_node, new_node1, new_node2): """Replace an existing node in the graph with two nodes in sequence. @@ -728,6 +730,11 @@ def split_node(self, old_node, new_node1, new_node2): repl = {old_name: new_name for old_name, new_name in zip(old_node.outputs, new_node2.outputs)} repl.update({old_name: new_name for old_name, new_name in zip(old_node.inputs, new_node1.inputs)}) + for old_output in old_node.outputs: + if old_output in self.outputs: + new_output = repl[old_output] + self.outputs = [new_output if name == old_output else name for name in self.outputs] + for node in self.graph.values(): for i, n in enumerate(node.inputs): if n in repl: @@ -745,8 +752,6 @@ def split_node(self, old_node, new_node1, new_node2): new_graph[key] = value self.graph = new_graph - if old_node.name in self.outputs: - self.outputs = [new_node2.name if name == old_node.name else name for name in self.outputs] def next_layer(self): self.index += 1 From 5442b944f06135de7968c283bff031f8aab9c30d Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 22 Aug 2025 18:57:27 -0500 Subject: [PATCH 09/14] add optimizers to insert sideband layers --- .../oneapi_accelerator_backend.py | 23 +++++- .../oneapi_accelerator_layers.py | 39 +++++---- .../oneapi_accelerator/passes/sidebands.py | 82 +++++++++++++++++++ hls4ml/model/graph.py | 39 +++++---- 4 files changed, 149 insertions(+), 34 deletions(-) create mode 100644 hls4ml/backends/oneapi_accelerator/passes/sidebands.py diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py index 1ac13c8e27..8036d21e93 100644 --- a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py @@ -1,5 +1,5 @@ from hls4ml.backends import OneAPIBackend -from hls4ml.model.flow import get_flow, register_flow +from hls4ml.model.flow import register_flow class OneAPIAcceleratorBackend(OneAPIBackend): @@ -22,10 +22,25 @@ def _register_flows(self): ] oneapi_types_flow = register_flow('specific_types', oneapi_types, requires=['oneapi:init_layers'], backend=self.name) - ip_flow_requirements = [ - oneapi_types_flow if opt == 'oneapi:specific_types' else opt for opt in get_flow('oneapi:ip').requires + streaming_passes = [ + 'oneapi:clone_output', + 'oneapiaccelerator:extract_sideband', + 'oneapiaccelerator:merge_sideband', ] - self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + streaming_flow = register_flow('streaming', streaming_passes, requires=['oneapi:init_layers'], backend=self.name) + + accel_flow_requirements = [ + 'optimize', + 'oneapi:init_layers', + streaming_flow, + 'oneapi:quantization', + 'oneapi:optimize', + oneapi_types_flow, + 'oneapi:apply_templates', + ] + + accel_flow_requirements = list(filter(None, accel_flow_requirements)) + self._default_flow = register_flow('accel', None, requires=accel_flow_requirements, backend=self.name) def create_initial_config( self, part, clock_period=5, hyperopt_handshake=False, io_type='io_parallel', write_tar=False, **_ diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py index da77c2bca0..d7416c7b39 100644 --- a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py @@ -1,24 +1,35 @@ -import numpy as np - -from hls4ml.model.attributes import Attribute, ConfigurableAttribute, TypeAttribute -from hls4ml.model.layers import Layer +from hls4ml.model.layers import Layer, register_layer from hls4ml.model.types import IntegerPrecisionType +SIDEBAND_SHAPE = 2 -class ExtractSideband(Layer): - '''This layer extract the sideband and sends it to a different strem - ''' - SIDEBAND_SHAPE = 2 +class SidebandExtraction(Layer): + """This layer extract the sideband and sends it to a different strem""" def initialize(self): inp = self.get_input_variable() # I think the order of these must be as stated because they each set the result_t type. # We want the second one to be the actual result_t. - self.add_output_variable(SIDEBAND_SHAPE, - out_name='sideband', - var_name='sideband_out', - type_name='sideband_t', - precision=IntegerPrecisionType(1, False)) - self.add_output_variable(inp.shape, precision=inp.precision) + self.add_output_variable( + SIDEBAND_SHAPE, + out_name='sideband', + var_name='sideband_out', + type_name='sideband_t', + precision=IntegerPrecisionType(1, False), + ) + self.add_output_variable(inp.shape, precision=inp.type.precision) + + +class SidebandMerging(Layer): + """This layer gets the sideband from a different input and merges it""" + + def initialize(self): + inp = self.get_input_variable() + self.add_output_variable(inp.shape, precision=inp.type.precision) + + +# register the layers +register_layer('SidebandExtraction', SidebandExtraction) +register_layer('SidebandMerging', SidebandMerging) diff --git a/hls4ml/backends/oneapi_accelerator/passes/sidebands.py b/hls4ml/backends/oneapi_accelerator/passes/sidebands.py new file mode 100644 index 0000000000..953eb4d588 --- /dev/null +++ b/hls4ml/backends/oneapi_accelerator/passes/sidebands.py @@ -0,0 +1,82 @@ +""" +This file contains optimizers to add layers to extract and merge the sidebands. This is useful +for the accelerator flow when using io_stream. + +Warning: current version only works for network with single inputs and outputs. + +""" + +import warnings +from collections import OrderedDict + +from hls4ml.backends.oneapi_accelerator.oneapi_accelerator_layers import SidebandExtraction, SidebandMerging +from hls4ml.model.layers import Input +from hls4ml.model.optimizer import OptimizerPass + + +class ExtractSideband(OptimizerPass): + """Add a layer after the input to extract the sideband signals.""" + + def match(self, node): + if not (isinstance(node, Input) and node.model.config.get_config_value('IOType') == 'io_stream'): + return False + # now check that not already converted + output_nodes = node.get_output_nodes() + if len(output_nodes) == 1 and isinstance(output_nodes[0], SidebandExtraction): + # already transformed + return False + return True + + def transform(self, model, node): + if len(model.inputs) > 1: + warnings.warn('Current sideband extraction scheme only tested on models with one input', stacklevel=1) + + attributes = {'input_shape': node.get_attr('input_shape')} + new_node = model.make_node( + SidebandExtraction, + f'{node.name}_extract_sb', + attributes, + inputs=[node.outputs[0]], + outputs=[f'{node.name}_extract_sb', 'sideband'], + ) + model.insert_node(new_node) + return True + + +class MergeSideband(OptimizerPass): + """Add a layer after the last layer to merge the sideband signals.""" + + def match(self, node): + for node_out in node.outputs: + if node_out in node.model.outputs: # if the node output is a model output + return True + return False + + def transform(self, model, node): + if len(model.outputs) > 1: + warnings.warn('Current sideband extraction scheme only tested on models with one output', stacklevel=1) + + attributes = {} + + inputs = [out for out in node.outputs if out in model.outputs] + + if len(inputs) != 1: + raise RuntimeError('Unsupported number of outputs found') + + inputs.append('sideband') + + new_name = f'{node.name}_merge_sb' + new_node = model.make_node(SidebandMerging, new_name, attributes, inputs=inputs) + + # note that model.insert_node fails here because of the two input nodes, so using a custom version below + model.outputs[0] = new_name + + new_graph = OrderedDict() + for k, v in model.graph.items(): + new_graph[k] = v + if k == node.name: + new_graph[new_node.name] = new_node + + model.graph = new_graph + + return True diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 1e2943606b..dc636d39d4 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -577,7 +577,7 @@ def make_node(self, kind, name, attributes, inputs, outputs=None, initialize=Tru self.output_vars[o] = out_var return node - def insert_node(self, node, before=None, input_idx=0): + def insert_node(self, node, before=None, input_idx=-1): """Insert a new node into the model graph. The node to be inserted should be created with `make_node()` function. The optional @@ -587,7 +587,8 @@ def insert_node(self, node, before=None, input_idx=0): node (Layer): Node to insert before (Layer, optional): The next node in sequence before which a new node should be inserted. - input_idx (int, optional): If the next node takes multiple inputs, the input index + input_idx (int, optional): If the next node takes multiple inputs, the input index; + The default (-1) means match by name Raises: Exception: If an attempt to insert a node with multiple inputs is made or if `before` does not specify a correct node in sequence. @@ -603,19 +604,28 @@ def insert_node(self, node, before=None, input_idx=0): if overlap: next_nodes.append(x) - if before is None: - next_node = next((x for x in self.graph.values() if x.inputs and x.inputs[0] in prev_node.outputs), None) - else: - if before not in next_nodes: - raise Exception( - 'Cannot insert a node {} before {} (candidates: {}).'.format( - node.name, before.name, ','.join([n.name for n in next_nodes]) + if before is not None: + if not isinstance(before, (tuple, list)): + before = [before] + + # check that before is in next_nodes + for bf in before: + if bf not in next_nodes: + raise RuntimeError( + 'Cannot insert a node {} before {} (candidates: {}).'.format( + node.name, before.name, ','.join([n.name for n in next_nodes]) + ) ) - ) - next_node = before + # only put before as next_nodes + next_nodes = before - if next_node is not None: - next_node.inputs[input_idx] = node.outputs[0] + if next_nodes: + repl = {old_name: new_name for old_name, new_name in zip(prev_node.outputs, node.outputs)} + for next_node in next_nodes: + if input_idx >= 0: + next_node.inputs[input_idx] = node.outputs[0] + else: + next_node.inputs = [repl[val] if val in repl else val for val in next_node.inputs] else: self.outputs = [node.outputs[0] if name == prev_node.outputs[0] else name for name in self.outputs] @@ -708,8 +718,6 @@ def replace_node(self, old_node, new_node): self.graph = OrderedDict((new_node.name, new_node) if k == old_node.name else (k, v) for k, v in self.graph.items()) - - def split_node(self, old_node, new_node1, new_node2): """Replace an existing node in the graph with two nodes in sequence. @@ -752,7 +760,6 @@ def split_node(self, old_node, new_node1, new_node2): new_graph[key] = value self.graph = new_graph - def next_layer(self): self.index += 1 return self.index From b4e8da3067e5b8a27913ac8a58816fb59ccb3f8e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Fri, 22 Aug 2025 20:36:09 -0500 Subject: [PATCH 10/14] try to setup sideband templates --- .../oneapi_accelerator_layers.py | 11 +++ .../passes/sideband_templates.py | 71 +++++++++++++++++++ .../oneapi_accelerator/passes/sidebands.py | 7 +- .../firmware/nnet_utils/nnet_stream_beat.h | 16 +++-- 4 files changed, 96 insertions(+), 9 deletions(-) create mode 100644 hls4ml/backends/oneapi_accelerator/passes/sideband_templates.py diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py index d7416c7b39..ceb5adbcd3 100644 --- a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py @@ -1,3 +1,4 @@ +from hls4ml.model.attributes import Attribute from hls4ml.model.layers import Layer, register_layer from hls4ml.model.types import IntegerPrecisionType @@ -7,8 +8,13 @@ class SidebandExtraction(Layer): """This layer extract the sideband and sends it to a different strem""" + _expected_attributes = [ + Attribute('n_in'), + ] + def initialize(self): inp = self.get_input_variable() + self.set_attr('n_in', inp.size()) # I think the order of these must be as stated because they each set the result_t type. # We want the second one to be the actual result_t. @@ -25,8 +31,13 @@ def initialize(self): class SidebandMerging(Layer): """This layer gets the sideband from a different input and merges it""" + _expected_attributes = [ + Attribute('n_in'), + ] + def initialize(self): inp = self.get_input_variable() + self.set_attr('n_in', inp.size()) self.add_output_variable(inp.shape, precision=inp.type.precision) diff --git a/hls4ml/backends/oneapi_accelerator/passes/sideband_templates.py b/hls4ml/backends/oneapi_accelerator/passes/sideband_templates.py new file mode 100644 index 0000000000..890141be09 --- /dev/null +++ b/hls4ml/backends/oneapi_accelerator/passes/sideband_templates.py @@ -0,0 +1,71 @@ +"""The sideband handling templates are needed for oneAPI accelerator when using io_stream. +They are not used in io_paralle. +""" + +from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate +from hls4ml.backends.oneapi_accelerator.oneapi_accelerator_layers import SidebandExtraction, SidebandMerging +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate + +sideband_config_template = """struct config{index} : nnet::sideband_config {{ + static constexpr unsigned n_in = {n_in}; +}};\n""" +sideband_stream_function_template = '{name}.async();' +sideband_extract_task_sequence_template = ( + 'task_sequence> {name};' +) +sideband_merge_task_sequence_template = ( + 'task_sequence> {name};' +) +sideband_include_list = ['nnet_utils/nnet_stream_beat.h'] + + +class SidebandConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__((SidebandExtraction, SidebandMerging)) + self.template = sideband_config_template + + def format(self, node): + params = self._default_config_params(node) + return self.template.format(**params) + + +class SidebandFunctionTemplate(FunctionCallTemplate): + """Only used to add the include list""" + + def __init__(self): + super().__init__((SidebandExtraction, SidebandMerging), include_header=sideband_include_list) + + def format(self, node): + return '' + + +class SidebandStreamFunctionTemplate(StreamFunctionCallTemplate): + def __init__(self): + super().__init__((SidebandExtraction, SidebandMerging)) + self.template = sideband_stream_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) + + +class SidebandExtractionTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(SidebandExtraction) + self.template = sideband_extract_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['skip_pipe'] = node.get_output_variable(1).pipe_name + return self.template.format(**params) + + +class SidebandMergeTaskSequenceTemplate(TaskSequenceTemplate): + def __init__(self): + super().__init__(SidebandMerging) + self.template = sideband_merge_task_sequence_template + + def format(self, node): + params = self._default_function_params(node) + params['skip_pipe'] = node.get_input_variable(1).pipe_name + return self.template.format(**params) diff --git a/hls4ml/backends/oneapi_accelerator/passes/sidebands.py b/hls4ml/backends/oneapi_accelerator/passes/sidebands.py index 953eb4d588..0d4f646125 100644 --- a/hls4ml/backends/oneapi_accelerator/passes/sidebands.py +++ b/hls4ml/backends/oneapi_accelerator/passes/sidebands.py @@ -47,9 +47,10 @@ class MergeSideband(OptimizerPass): """Add a layer after the last layer to merge the sideband signals.""" def match(self, node): - for node_out in node.outputs: - if node_out in node.model.outputs: # if the node output is a model output - return True + if node.model.config.get_config_value('IOType') == 'io_stream': + for node_out in node.outputs: + if node_out in node.model.outputs: # if the node output is a model output + return True return False def transform(self, model, node): diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h index 5ebe98b87d..0ab2fe5435 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h @@ -4,10 +4,15 @@ // These are functions just for streaming in accelerator mode. They convert from using packets // to not using packets, and visa versa. +struct sideband_config { + static const unsigned n_in = 10; +}; + // ************************************************* // Remove sideband and passes it to end via skip pipe // ************************************************* -template [[intel::use_stall_enable_clusters]] void remove_sideband_stream() { +template +[[intel::use_stall_enable_clusters]] void extract_sideband_stream() { [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; @@ -43,7 +48,8 @@ template [ // Recieves sideband via skip pipe, and makees it sideband // ************************************************* -template [[intel::use_stall_enable_clusters]] void add_sideband_stream() { +template +[[intel::use_stall_enable_clusters]] void merge_sideband_stream() { using ResT = typename ExtractDataType::value_type>::value_type; [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; @@ -51,7 +57,6 @@ template [ auto skip_data = skip_pipe::read(); - LinearActLoop: [[intel::initiation_interval(1)]] for (int i = 0; i < num_transfers; i++) { auto in_data = data_pipe::read(); @@ -62,10 +67,9 @@ template [ out_data.data[j] = in_data.data[j]; } out_data.sop = (i == 0) ? skip_data[0] : false; - out_data.eop = (i == num_transfers-1) ? skip_data[1] : false; + out_data.eop = (i == num_transfers - 1) ? skip_data[1] : false; res_pipe::write(out_data); } } - -#endif \ No newline at end of file +#endif From 8e9cb0534eefaa2cf15f4abccd7770c3ff0ce815 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sat, 23 Aug 2025 17:56:10 -0500 Subject: [PATCH 11/14] another snapshot fixing some bugs --- .../oneapi_accelerator/oneapi_accelerator_backend.py | 6 +++++- .../oneapi_accelerator/passes/sideband_templates.py | 4 ++-- hls4ml/backends/oneapi_accelerator/passes/sidebands.py | 2 +- .../templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h | 6 ++++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py index 8036d21e93..0c5a5f51c4 100644 --- a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_backend.py @@ -29,6 +29,10 @@ def _register_flows(self): ] streaming_flow = register_flow('streaming', streaming_passes, requires=['oneapi:init_layers'], backend=self.name) + template_flow = register_flow( + 'apply_templates', self._get_layer_templates, requires=['oneapi:init_layers'], backend=self.name + ) + accel_flow_requirements = [ 'optimize', 'oneapi:init_layers', @@ -36,7 +40,7 @@ def _register_flows(self): 'oneapi:quantization', 'oneapi:optimize', oneapi_types_flow, - 'oneapi:apply_templates', + template_flow, ] accel_flow_requirements = list(filter(None, accel_flow_requirements)) diff --git a/hls4ml/backends/oneapi_accelerator/passes/sideband_templates.py b/hls4ml/backends/oneapi_accelerator/passes/sideband_templates.py index 890141be09..a417db1c9f 100644 --- a/hls4ml/backends/oneapi_accelerator/passes/sideband_templates.py +++ b/hls4ml/backends/oneapi_accelerator/passes/sideband_templates.py @@ -56,7 +56,7 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) - params['skip_pipe'] = node.get_output_variable(1).pipe_name + params['skip_pipe'] = node.get_output_variable('sideband').pipe_name return self.template.format(**params) @@ -67,5 +67,5 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) - params['skip_pipe'] = node.get_input_variable(1).pipe_name + params['skip_pipe'] = node.get_input_variable('sideband').pipe_name return self.template.format(**params) diff --git a/hls4ml/backends/oneapi_accelerator/passes/sidebands.py b/hls4ml/backends/oneapi_accelerator/passes/sidebands.py index 0d4f646125..37ac4f8616 100644 --- a/hls4ml/backends/oneapi_accelerator/passes/sidebands.py +++ b/hls4ml/backends/oneapi_accelerator/passes/sidebands.py @@ -47,7 +47,7 @@ class MergeSideband(OptimizerPass): """Add a layer after the last layer to merge the sideband signals.""" def match(self, node): - if node.model.config.get_config_value('IOType') == 'io_stream': + if node.model.config.get_config_value('IOType') == 'io_stream' and not isinstance(node, SidebandMerging): for node_out in node.outputs: if node_out in node.model.outputs: # if the node output is a model output return True diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h index 0ab2fe5435..f1d5764b43 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h @@ -4,6 +4,8 @@ // These are functions just for streaming in accelerator mode. They convert from using packets // to not using packets, and visa versa. +namespace nnet { + struct sideband_config { static const unsigned n_in = 10; }; @@ -37,7 +39,7 @@ template } eop = in_data.eop; } - typename ExtractPipeType::value_type skip_data; // this is a two-element array, {sop, eop}. + typename nnet::ExtractPipeType::value_type skip_data; // this is a two-element array, {sop, eop}. skip_data[0] = sop; skip_data[1] = eop; skip_pipe::write(skip_data); @@ -71,5 +73,5 @@ template res_pipe::write(out_data); } } - +} // namespace nnet #endif From 91bef7b0e3969e800bcc4f9c385a0a9a69715d66 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Sun, 24 Aug 2025 19:59:59 -0500 Subject: [PATCH 12/14] various bug fixes --- hls4ml/backends/oneapi/oneapi_template.py | 4 ++-- .../oneapi_accelerator/oneapi_accelerator_layers.py | 7 +++---- .../backends/oneapi_accelerator/passes/transform_types.py | 2 ++ 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/hls4ml/backends/oneapi/oneapi_template.py b/hls4ml/backends/oneapi/oneapi_template.py index c86b8f7ea3..48668688e2 100644 --- a/hls4ml/backends/oneapi/oneapi_template.py +++ b/hls4ml/backends/oneapi/oneapi_template.py @@ -52,8 +52,8 @@ def _default_function_params(self, layer): params = self._default_params(layer) params['name'] = layer.name params['config'] = f'config{layer.index}' - params['input_pipe'] = layer.get_input_variable().pipe_name - params['output_pipe'] = layer.get_output_variable().pipe_name + params['input_pipe'] = layer.get_input_variable(layer.inputs[0]).pipe_name + params['output_pipe'] = layer.get_output_variable(layer.outputs[0]).pipe_name return params diff --git a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py index ceb5adbcd3..710ce94927 100644 --- a/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py +++ b/hls4ml/backends/oneapi_accelerator/oneapi_accelerator_layers.py @@ -1,4 +1,4 @@ -from hls4ml.model.attributes import Attribute +from hls4ml.model.attributes import Attribute, TypeAttribute from hls4ml.model.layers import Layer, register_layer from hls4ml.model.types import IntegerPrecisionType @@ -8,9 +8,7 @@ class SidebandExtraction(Layer): """This layer extract the sideband and sends it to a different strem""" - _expected_attributes = [ - Attribute('n_in'), - ] + _expected_attributes = [Attribute('n_in'), TypeAttribute('sideband_t', description='The type of the sidbands')] def initialize(self): inp = self.get_input_variable() @@ -25,6 +23,7 @@ def initialize(self): type_name='sideband_t', precision=IntegerPrecisionType(1, False), ) + self.set_attr('sideband_t', self.get_attr('sideband').type) # need to manually set this, unlike result_t self.add_output_variable(inp.shape, precision=inp.type.precision) diff --git a/hls4ml/backends/oneapi_accelerator/passes/transform_types.py b/hls4ml/backends/oneapi_accelerator/passes/transform_types.py index 87da9486e4..8947bacef9 100644 --- a/hls4ml/backends/oneapi_accelerator/passes/transform_types.py +++ b/hls4ml/backends/oneapi_accelerator/passes/transform_types.py @@ -50,6 +50,8 @@ def transform(self, model, node): raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') node.set_attr(out_name, new_var) + if new_var.type.name in node.attributes: + node.set_attr(new_var.type.name, new_var.type) # this is for variables that are not result_t for w_name, weight in node.weights.items(): new_weight = self.weight_var_converter.convert(weight) From 7e14ddc0a50c121e7cf26919aa0e35c78cd39d5e Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Mon, 25 Aug 2025 11:25:21 -0500 Subject: [PATCH 13/14] fix some HLS bugs --- .../templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h index f1d5764b43..1b41f008c7 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_stream_beat.h @@ -66,10 +66,10 @@ template LinearPackLoop: #pragma unroll for (int j = 0; j < std::tuple_size{}; j++) { - out_data.data[j] = in_data.data[j]; + out_data.data[j] = in_data[j]; } - out_data.sop = (i == 0) ? skip_data[0] : false; - out_data.eop = (i == num_transfers - 1) ? skip_data[1] : false; + out_data.sop = (i == 0) ? static_cast(skip_data[0]) : false; + out_data.eop = (i == num_transfers - 1) ? static_cast(skip_data[1]) : false; res_pipe::write(out_data); } } From 25dcd93c0bb406e4efb6fe5130b0a91c29b3aee8 Mon Sep 17 00:00:00 2001 From: Jovan Mitrevski Date: Tue, 26 Aug 2025 16:38:09 -0500 Subject: [PATCH 14/14] pre-commit fixes --- .../oneapi/firmware/nnet_utils/nnet_data_movement.h | 2 -- hls4ml/writer/oneapi_accelerator_writer.py | 7 +++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h index 93162b6b7c..04395b2bda 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h @@ -186,8 +186,6 @@ template void DMA_bridge_convert_da } } - - } // namespace nnet #endif diff --git a/hls4ml/writer/oneapi_accelerator_writer.py b/hls4ml/writer/oneapi_accelerator_writer.py index 5c75b729c5..b5cdc3ed3b 100644 --- a/hls4ml/writer/oneapi_accelerator_writer.py +++ b/hls4ml/writer/oneapi_accelerator_writer.py @@ -336,7 +336,9 @@ def write_bridge(self, model): dtype = line.split('#', 1)[1].strip() newline = '' for i in model_inputs: - newline += indent + f'nnet::DMA_bridge_convert_data<{dtype}, {i.pipe_name}, {i.size_cpp()}>(q, {i.name});\n' + newline += ( + indent + f'nnet::DMA_bridge_convert_data<{dtype}, {i.pipe_name}, {i.size_cpp()}>(q, {i.name});\n' + ) newline += ( indent @@ -346,7 +348,8 @@ def write_bridge(self, model): for o in model_outputs: newline += ( - indent + f'nnet::DMA_bridge_convert_data_back<{o.pipe_name}, {dtype}, {o.size_cpp()}>(q, {o.name});\n' + indent + + f'nnet::DMA_bridge_convert_data_back<{o.pipe_name}, {dtype}, {o.size_cpp()}>(q, {o.name});\n' ) newline += '\n' newline += indent + 'q.wait();\n'