diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json index 1279ec22d0..382ae8b1fd 100644 --- a/hls4ml/backends/vivado_accelerator/supported_boards.json +++ b/hls4ml/backends/vivado_accelerator/supported_boards.json @@ -38,5 +38,11 @@ "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, "c_drivers": {} + }, + "ultra96v2": { + "part": "xczu3eg-sbva484-1-e", + "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"}, + "python_drivers": {"axi_stream": "axi_master_driver.py"}, + "c_drivers": { "axi_master": "axi_master_design.c"} } } diff --git a/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py b/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py index f9c7848ef2..230bfb849e 100644 --- a/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py +++ b/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py @@ -128,14 +128,22 @@ def get_clock_period(self): def get_driver_path(self): if self.board.startswith('alveo'): return '../templates/vivado_accelerator/' + 'alveo/' + self.driver + '_drivers/' + \ - self.get_driver_file() + self.get_driver_files() else: return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + \ - self.get_driver_file() - - def get_driver_file(self): - driver_ext = '.py' if self.driver == 'python' else '.h' - return self.interface + '_driver' + driver_ext + self.get_driver_files() + + #def get_driver_file(self): + # driver_ext = '.py' if self.driver == 'python' else '.h' + # return self.interface + '_driver' + driver_ext + + def get_driver_files(self): + if self.driver == 'c': + driver_dir = 'sdk' + return driver_dir + elif self.driver == 'python': + driver_ext = '.py' + return self.interface + '_driver' + driver_ext def get_krnl_rtl_src_dir(self): return '../templates/vivado_accelerator/' + 'alveo/' + '/krnl_rtl_src' diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 8bcb832cef..39dd7a3f53 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -604,6 +604,11 @@ def compile(self): self._top_function_lib = ctypes.cdll.LoadLibrary(lib_name) def _get_top_function(self, x): + + io_type = self.config.get_config_value('IOType') + interface = self.config.get_config_value('AcceleratorConfig')['Interface'] if self.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + if self._top_function_lib is None: raise Exception('Model not compiled') if len(self.get_input_variables()) == 1: @@ -611,7 +616,8 @@ def _get_top_function(self, x): else: xlist = x n_outputs = len(self.get_output_variables()) - + n_weights = len(self.get_weight_variables()) + for xi in xlist: if not isinstance(xi, np.ndarray): raise Exception('Expected numpy.ndarray, but got {}'.format(type(x))) @@ -628,9 +634,9 @@ def _get_top_function(self, x): else: raise Exception('Invalid type ({}) of numpy array. 
Supported types are: single, float32, double, float64, float_.'.format(x0.dtype)) - top_function.restype = None - top_function.argtypes = [npc.ndpointer(ctype, flags="C_CONTIGUOUS") for i in range(len(xlist) + n_outputs)] + top_function.argtypes = [npc.ndpointer(ctype, flags="C_CONTIGUOUS") \ + for i in range(len(xlist) + (n_weights if config_weights else 0) + n_outputs)] return top_function, ctype @@ -654,10 +660,16 @@ def _compute_n_samples(self, x): return int(n_sample) def predict(self, x): + + io_type = self.config.get_config_value('IOType') + interface = self.config.get_config_value('AcceleratorConfig')['Interface'] if self.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + top_function, ctype = self._get_top_function(x) n_samples = self._compute_n_samples(x) n_inputs = len(self.get_input_variables()) n_outputs = len(self.get_output_variables()) + n_weights = len(self.get_weight_variables()) curr_dir = os.getcwd() os.chdir(self.config.get_output_dir() + '/firmware') @@ -675,10 +687,16 @@ def predict(self, x): inp = [np.asarray(xj[i]) for xj in x] argtuple = inp argtuple += predictions + if config_weights: + for j in range(n_weights): + weights = [float(w) for w in self.get_weight_variables()[j]] + argtuple += [np.asarray(weights)] argtuple = tuple(argtuple) top_function(*argtuple) - output.append(predictions) - + if config_weights and n_samples == 1 and n_inputs: + output.append([predictions]) + else: + output.append(predictions) # Convert to list of numpy arrays (one for each output) output = [np.asarray([output[i_sample][i_output] for i_sample in range(n_samples)]) for i_output in range(n_outputs)] diff --git a/hls4ml/model/optimizer/passes/nop.py b/hls4ml/model/optimizer/passes/nop.py index daf3e71fc4..fae9bbcfbc 100644 --- a/hls4ml/model/optimizer/passes/nop.py +++ b/hls4ml/model/optimizer/passes/nop.py @@ -6,7 +6,7 @@ def match(self, node): cast = False if isinstance(node, Activation): cast = node.get_input_variable().type.precision != node.get_output_variable().type.precision - return isinstance(node, Activation) and node.get_attr('activation') == 'linear' and not cast + return isinstance(node, Activation) and node.get_attr('activation') == 'linear' # and not cast def transform(self, model, node): model.remove_node(node) diff --git a/hls4ml/templates/vivado/myproject_test.cpp b/hls4ml/templates/vivado/myproject_test.cpp index 7de8dd4b4a..8a28926cd1 100644 --- a/hls4ml/templates/vivado/myproject_test.cpp +++ b/hls4ml/templates/vivado/myproject_test.cpp @@ -22,8 +22,9 @@ #include #include #include -#include -#include +#include +#include +#include #include "firmware/myproject.h" #include "firmware/nnet_utils/nnet_helpers.h" @@ -56,6 +57,10 @@ int main(int argc, char **argv) std::string pline; int e = 0; + //hls-fpga-machine-learning insert weights + + //hls-fpga-machine-learning insert load weights + if (fin.is_open() && fpr.is_open()) { while ( std::getline(fin,iline) && std::getline (fpr,pline) ) { if (e % CHECKPOINT == 0) std::cout << "Processing input " << e << std::endl; diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp index 7a06633e58..519a36194c 100644 --- a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp +++ b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp @@ -1,14 +1,17 @@ //hls-fpga-machine-learning insert include void myproject( - input_axi_t in[N_IN], - output_axi_t out[N_OUT] - ){ + input_axi_t 
in[N_IN] + , output_axi_t out[N_OUT] + //hls-fpga-machine-learning insert weights + ){ //hls-fpga-machine-learning insert interface //hls-fpga-machine-learning insert local vars + //hls-fpga-machine-learning insert enqueue weights + //hls-fpga-machine-learning insert enqueue //hls-fpga-machine-learning insert call diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.h b/hls4ml/templates/vivado_accelerator/myproject_axi.h index fe3dbc5cde..3d31fa0a8f 100644 --- a/hls4ml/templates/vivado_accelerator/myproject_axi.h +++ b/hls4ml/templates/vivado_accelerator/myproject_axi.h @@ -7,7 +7,8 @@ //hls-fpga-machine-learning insert definitions void myproject( - input_axi_t in[N_IN], - output_axi_t out[N_OUT] - ); + input_axi_t in[N_IN] + , output_axi_t out[N_OUT] + //hls-fpga-machine-learning insert weights + ); #endif diff --git a/hls4ml/templates/vivado_accelerator/standalone_main.c b/hls4ml/templates/vivado_accelerator/standalone_main.c new file mode 100644 index 0000000000..208985b1bf --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/standalone_main.c @@ -0,0 +1,269 @@ +/** + * + * Set Heap Size in ldscript.ld to 0x1000000 (16MB) + * + */ + +#include "xmyproject_axi.h" /* TODO: design-dependent name */ +#include "stdio.h" /* PRINTF */ +#include "unistd.h" /* sleep */ +#include "stdlib.h" +#include "malloc.h" +#include "assert.h" +#include "xil_io.h" /* peripheral read/write wrappers */ +#include "xtime_l.h" /* to measure performance of the system */ +#include "platform.h" /* platform init/cleanup functions */ +#include "xil_cache.h" /* enable/disable caches etc */ +#include "xil_printf.h" /* UART debug print functions */ +#include "xparameters.h" /* peripherals base addresses */ + +#include "data.h" + +//#define __DEBUG__ + +#define MAX_PRINT_ELEMENTS (16) + +#define PRINTF printf + +const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS; +const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS; + +#if 1 +/* Accelerator verification */ +#define REFERENCE_OUTPUTS data_y_hls_outputs +#else +/* Accelerator validation */ +#define REFERENCE_OUTPUTS data_y_outputs +//#define REFERENCE_OUTPUTS data_y_keras_outputs +#endif + +unsigned get_max(float *data, unsigned n_elements) { + float max_value = 0.0; + unsigned max_index = 0; + for (unsigned i = 0; i < n_elements; i++) + if (data[i] >= max_value) { + max_index = i; + max_value = data[i]; + } + return max_index; +} + +float *inputs_mem = NULL; +float *outputs_mem = NULL; +float *reference_mem = NULL; + +/* Accelerator configuration */ +XMyproject_axi accelerator; /* TODO: design-dependent name */ +XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */ + +/* Accelerator initialization routine */ +void init_accelerators() { + PRINTF("INFO: Initializing accelerator\r\n"); + accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */ + if (accelerator_cfg) { + int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */ + if (status != XST_SUCCESS) { + PRINTF("ERROR: Initializing accelerator\r\n"); + } + } +} + +/* Reference implementation of the accelerator in software */ +int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) { +#ifdef __DEBUG__ + PRINTF("INFO: Reference outputs are pre-compiled. 
It would be nice to run a software model here.\r\n"); +#endif + /* See data.h for inputs and outputs */ + for (unsigned i = 0; i < n_samples * n_y_ouputs; i++) { + sw_outputs_mem[i] = REFERENCE_OUTPUTS[i]; + } + return 0; +} + +/* Profiling function */ +double get_elapsed_time(XTime start, XTime stop) { + return 1.0 * (stop - start) / (COUNTS_PER_SECOND); +} + +/* Dump data to the console */ +void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) { + PRINTF("INFO: %s[%u][%u]:\r\n", label, n_samples, feature_count); + /* Print at most MAX_PRINT_ELEMENTS */ + for (unsigned i = 0; i < n_samples && i < MAX_PRINT_ELEMENTS; i++) { + PRINTF("INFO: [%u] ", i); + for (unsigned j = 0; j < feature_count; j++) { + unsigned index = i * feature_count + j; + PRINTF("%f ", data[index]); + } + PRINTF("\r\n"); + } +} + +/* The top of the hill :-) */ +int main(int argc, char** argv) { + + XTime start, stop; + double calibration_time; + double sw_elapsed = 0; + double hw_elapsed = 0; + double cache_elapsed = 0; + unsigned hw_errors; + + char __attribute__ ((unused)) dummy; /* dummy input */ + + /* Initialize platform (uart and caches) */ + init_platform(); + + PRINTF("\r\n"); + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */ + PRINTF("INFO: ==================================================\r\n"); + + init_accelerators(); + + inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float)); + outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float)); + reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float)); + + /* Calibration */ + XTime_GetTime(&start); + sleep(1); + XTime_GetTime(&stop); + calibration_time = get_elapsed_time(start, stop); + PRINTF("INFO: Time calibration for one second (%lf sec)\r\n", calibration_time); + + /* Initialize memory */ + PRINTF("INFO: Initialize memory\r\n"); + PRINTF("INFO: - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */ + PRINTF("INFO: - Inputs count: %u\r\n", N_X_INPUTS); + PRINTF("INFO: - Outputs count: %u\r\n", N_Y_OUTPUTS); + PRINTF("INFO: - Data size: %u B\r\n", sizeof(float)); + PRINTF("INFO: - Total input size: %u B, %.2f KB, %.2f MB\r\n", N_X_INPUTS * N_SAMPLES * sizeof(float), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024)); + PRINTF("INFO: - Total output size: %u B, %.2f KB, %.2f MB\r\n", N_Y_OUTPUTS * N_SAMPLES * sizeof(float), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024)); + + // Set Heap Size in ldscript.ld to 0x1000000 (16MB) + //malloc_stats(); + + for (int i = 0; i < INPUT_N_ELEMENTS; i++) { + inputs_mem[i] = data_X_inputs[i]; + } + for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) { + outputs_mem[i] = 0x0; + } + + /* ****** SW REFERENCE ****** */ + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: Start SW reference implementation\r\n"); + XTime_GetTime(&start); + sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS); + XTime_GetTime(&stop); + sw_elapsed = get_elapsed_time(start, stop); + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: Press any key to start:\r\n"); + dummy = inbyte(); + //PRINTF("INFO:"); + + /* ****** HW ACCELERATOR ****** */ + PRINTF("INFO: Start HW accelerator\r\n"); + + XTime_GetTime(&start); + Xil_DCacheFlushRange((UINTPTR)inputs_mem, 
INPUT_N_ELEMENTS * sizeof(float)); + Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + XTime_GetTime(&stop); + cache_elapsed = get_elapsed_time(start, stop); + + /*hls-fpga-machine-learning insert configure weights*/ + + /*hls-fpga-machine-learning insert load weights on*/ + + /*hls-fpga-machine-learning insert start and wait*/ + + + for (unsigned j = 0; j < N_SAMPLES; j++) { + float *inputs_mem_i = inputs_mem + j * N_X_INPUTS; + float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS; + + /* Configure the accelerator */ + XTime_GetTime(&start); + XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */ + XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */ + + /*hls-fpga-machine-learning insert load weights off*/ + + XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */ + while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */ + + /* Get error status */ + //hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */ + XTime_GetTime(&stop); + hw_elapsed += get_elapsed_time(start, stop); + } + + XTime_GetTime(&start); + Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + XTime_GetTime(&stop); + cache_elapsed += get_elapsed_time(start, stop); + + PRINTF("INFO: HW accelerator done!\r\n"); + + /* ****** VALIDATION ****** */ + PRINTF("INFO: ================== Verification ==================\r\n"); +#ifdef __DEBUG__ + PRINTF("INFO: Dump data\r\n"); + dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS); + dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS); + dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS); +#endif + +#ifdef __DEBUG__ + PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed); +#endif + PRINTF("INFO: HW-acceleration exec. 
time (%d inferences):\r\n", N_SAMPLES); + PRINTF("INFO: - total %f sec\r\n", hw_elapsed); + PRINTF("INFO: - per-inference %.12f sec (%f ns)\r\n", hw_elapsed / (N_SAMPLES), (hw_elapsed*1000.0) / (N_SAMPLES)); + PRINTF("INFO: Cache flush time: %f sec\r\n", cache_elapsed); +#ifdef __DEBUG__ + PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed)); +#endif + + hw_errors = 0; +#if 1 + /* Accelerator verification */ + for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) { + if (outputs_mem[i] != reference_mem[i]) { + PRINTF("ERROR: [%d]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]); + hw_errors++; + } + } + PRINTF("INFO: Total errors = %d (out of %d elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS); + if (hw_errors > 0) + PRINTF("INFO: Verification: FAIL\r\n"); + else + PRINTF("INFO: Verification: PASS!\r\n"); +#else + /* Accelerator validation */ + for (unsigned s = 0; s < N_SAMPLES; s++) { + unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS); + unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS); + if (hw_digit != ref_digit) { +#ifdef __DEBUG__ + PRINTF("ERROR: [%d]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit); +#endif + hw_errors++; + } + } + float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0; + float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0); + PRINTF("INFO: Total errors = %d (out of %d digits)\r\n", hw_errors, N_SAMPLES); + PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate); + PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy); +#endif + PRINTF("INFO: ==================================================\r\n"); + + cleanup_platform(); + + return 0; +} + + diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c new file mode 100644 index 0000000000..8a46df8bde --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c @@ -0,0 +1,6 @@ +#include "xil_printf.h" + +int main(void) { + xil_printf("Hello world!\r\n"); + return 0; +} diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h new file mode 100644 index 0000000000..8a46df8bde --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h @@ -0,0 +1,6 @@ +#include "xil_printf.h" + +int main(void) { + xil_printf("Hello world!\r\n"); + return 0; +} diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile new file mode 100644 index 0000000000..03ab9b8de7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile @@ -0,0 +1,33 @@ +DESIGN := design_1 + +help: + @echo "INFO: make to show targets" +.PHONY: help + +--setup: + xsct ./setup.tcl $(DESIGN) +.PHONY: --setup + +sdk: --setup + rm -f $(DESIGN)_standalone/src/helloworld.c + cd $(DESIGN)_standalone/src && ln -s ../../common/main.c main.c + cd $(DESIGN)_standalone/src && ln -s ../../common/data.h data.h +.PHONY: sdk + +gui: + xsdk --workspace . 
& +.PHONY: gui + +clean: + rm -rf $(DESIGN)_platform + rm -rf $(DESIGN)_standalone + rm -rf $(DESIGN)_standalone_bsp + rm -rf RemoteSystemsTempFiles + rm -rf .Xil + rm -rf .metadata + rm -f *.log +.PHONY: clean + +ultraclean: clean + rm -rf hdf/*.hdf +.PHONY: ultraclean diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl new file mode 100644 index 0000000000..ea386d4281 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl @@ -0,0 +1,18 @@ +# See +# https://www.xilinx.com/html_docs/xilinx2019_1/SDK_Doc/xsct/intro/xsct_introduction.html + +setws . +if { $::argc == 1 } { + set myproject [lindex $::argv 0] + createhw -name ${myproject}\_platform -hwspec ../hdf/${myproject}\_wrapper.hdf + createapp -name ${myproject}\_standalone -app {Hello World} -proc psu_cortexa53_0 -hwproject ${myproject}\_platform -os standalone -arch 64 + configbsp -bsp ${myproject}\_standalone_bsp stdin psu_uart_1 + configbsp -bsp ${myproject}\_standalone_bsp stdout psu_uart_1 + updatemss -mss ${myproject}\_standalone_bsp/system.mss + regenbsp -bsp ${myproject}\_standalone_bsp + configapp -app ${myproject}\_standalone build-config release + configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_HEAP_SIZE=0x1000000} + configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_STACK_SIZE=0x40000} + projects -build + #configapp -app ${myproject}\_standalone -add define-compiler-symbols {FLAG=VALUE} +} diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_master_driver.py b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_master_driver.py new file mode 100644 index 0000000000..7e2419b8f5 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_master_driver.py @@ -0,0 +1,136 @@ +from pynq import DefaultHierarchy, DefaultIP, allocate +from pynq import Overlay +from datetime import datetime +import pynq.lib.dma +import numpy as np + + +class NeuralNetworkOverlay(Overlay): + #def __init__(self, bitfile_name, x_shape, y_shape, w2_shape, b2_shape, w5_shape, b5_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None): + #hls-fpga-machine-learning insert init + super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) + self.regin = self.myproject_axi_0.register_map.in_r.address + self.regout = self.myproject_axi_0.register_map.out_r.address + # + #self.regw2 = self.myproject_axi_0.register_map.w2.address + #self.regb2 = self.myproject_axi_0.register_map.b2.address + #self.regw5 = self.myproject_axi_0.register_map.w5.address + #self.regb5 = self.myproject_axi_0.register_map.b5.address + #hls-fpga-machine-learning insert registers + self.reglw = self.myproject_axi_0.register_map.load_weights.address + # + self.ctrl = self.myproject_axi_0.register_map.CTRL + self.input_buffer = allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + #self.w2_buffer = allocate(shape=w2_shape, dtype=dtype) + #self.b2_buffer = allocate(shape=b2_shape, dtype=dtype) + #self.w5_buffer = allocate(shape=w5_shape, dtype=dtype) + #self.b5_buffer = allocate(shape=b5_shape, dtype=dtype) + #hls-fpga-machine-learning insert buffers + def _print_dt(self, timea, timeb, N): + dt = (timeb - timea) + dts = dt.seconds + dt.microseconds * 10 ** -6 + rate = N / dts + print("Classified {} samples in {} seconds ({} inferences / 
s)".format(N, dts, rate)) + return dts, rate + +# def load_weights(self, w2, b2, w5, b5, debug=False, profile=False, encode=None): + #hls-fpga-machine-learning insert load weights + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - w*, b* : the weight and bias vectors. Should be numpy ndarray. + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode: function pointers. See `dtype` section for more information. + """ + if profile: + timea = datetime.now() + if encode is not None: + #w2 = encode(w2) + #b2 = encode(b2) + #w5 = encode(w5) + #b5 = encode(b5) + #hls-fpga-machine-learning insert encode + # + #self.w2_buffer[:] = w2 + #self.b2_buffer[:] = b2 + #self.w5_buffer[:] = w5 + #self.b5_buffer[:] = b5 + #hls-fpga-machine-learning insert set buffers + # + #self.myproject_axi_0.write(self.regw2, self.w2_buffer.physical_address) + #self.myproject_axi_0.write(self.regb2, self.b2_buffer.physical_address) + #self.myproject_axi_0.write(self.regw5, self.w5_buffer.physical_address) + #self.myproject_axi_0.write(self.regb5, self.b5_buffer.physical_address) + #hls-fpga-machine-learning insert set registers + # + self.myproject_axi_0.write(self.reglw, 0x1) + # + self.myproject_axi_0.write(self.ctrl.AP_START, 0x1) + if debug: + print("Config OK") + while not self.ctrl.AP_DONE: + if debug: + print("Polling...") + if debug: + print("Done OK") + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter. 
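            Example (sketch; the exact `__init__`/`load_weights` signatures are generated per model by the
            hls4ml writer, and the `w2/b2/w5/b5` names below follow the commented template above -- they are
            placeholders, as is the bitstream name):
            ```
            nn = NeuralNetworkOverlay('design_1.bit', x_shape, y_shape, w2_shape, b2_shape, w5_shape, b5_shape)
            nn.load_weights(w2, b2, w5, b5)                # fills the DDR buffers and runs once with load_weights = 1
            y_hw, dts, rate = nn.predict(X, profile=True)  # inference pass, runs with load_weights = 0
            ```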
+ """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.myproject_axi_0.write(self.regin, self.input_buffer.physical_address) + self.myproject_axi_0.write(self.regout, self.output_buffer.physical_address) + # + self.myproject_axi_0.write(self.reglw, 0x0) + # + self.myproject_axi_0.write(self.ctrl.AP_START, 0x1) + if debug: + print("Config OK") + while not self.ctrl.AP_DONE: + if debug: + print("Polling...") + if debug: + print("Done OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py new file mode 100644 index 0000000000..4adb187ab4 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py @@ -0,0 +1,75 @@ +from pynq import DefaultHierarchy, DefaultIP, allocate +from pynq import Overlay +from datetime import datetime +import pynq.lib.dma +import numpy as np + + +class NeuralNetworkOverlay(Overlay): + def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, + device=None): + super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) + self.sendchannel = self.hier_0.axi_dma_0.sendchannel + self.recvchannel = self.hier_0.axi_dma_0.recvchannel + self.input_buffer = allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + + def _print_dt(self, timea, timeb, N): + dt = (timeb - timea) + dts = dt.seconds + dt.microseconds * 10 ** -6 + rate = N / dts + print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate)) + return dts, rate + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter. 
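            Example (sketch; bitstream name and shapes are placeholders):
            ```
            nn = NeuralNetworkOverlay('design_1.bit', x_shape=(1024, 16), y_shape=(1024, 10))
            y_hw, dts, rate = nn.predict(X, profile=True)
            ```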
+ """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.sendchannel.transfer(self.input_buffer) + self.recvchannel.transfer(self.output_buffer) + if debug: + print("Transfer OK") + self.sendchannel.wait() + if debug: + print("Send OK") + self.recvchannel.wait() + if debug: + print("Receive OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer \ No newline at end of file diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl new file mode 100644 index 0000000000..2df93afca5 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl @@ -0,0 +1,26 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${myproject}_vivado_accelerator -part xczu3eg-sbva484-1-e -force + +set_property board_part em.avnet.com:ultra96:part0:1.2 [current_project] +set_property ip_repo_paths ${myproject}_prj [current_project] +update_ip_catalog + +# Create Block Designer design +create_bd_design "design_1" +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ultra_ps_e] +create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0 +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${myproject}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${myproject}_axi_0/s_axi_AXILiteS] + +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl new file mode 100644 index 0000000000..0d5eb1a89e --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl @@ -0,0 +1,100 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +# Project names +set design_name "design_1" +set hls_solution_name "solution1" +set ps_name "zynq_ultra_ps_e_0" +set acc_name "${project_name}_axi_0" + +# Board and chip part names +create_project ${project_name} ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force +set_property board_part avnet.com:ultra96v2:part0:1.2 [current_project] + +# Create block design +create_bd_design ${design_name} + +# Setup IP repo +#set_property ip_repo_paths ${project_name}_prj [current_project] +set_property ip_repo_paths ${project_name}_prj/${hls_solution_name}/impl/ip [current_project] 
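# NOTE (assumption): project.tcl, sourced at the top of this script, is generated
# alongside these board scripts by the hls4ml writer and is expected to define at
# least the variables they reference, e.g. (placeholder values):
#   set project_name "myproject"
#   set myproject    "myproject"
#   set bit_width_hls_input  16   ;# consumed by axi_stream_design.tcl below
#   set bit_width_hls_output 16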
+update_ip_catalog + +# Create and setup PS +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 ${ps_name} +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells ${ps_name}] +set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells ${ps_name}] +set_property -dict [list CONFIG.PSU__MAXIGP0__DATA_WIDTH {32} CONFIG.PSU__MAXIGP1__DATA_WIDTH {32}] [get_bd_cells ${ps_name}] + +# Create accelerator +create_bd_cell -type ip -vlnv xilinx.com:hls:myproject_axi:1.0 ${acc_name} + +# Wiring +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {Auto} \ + Clk_slave {Auto} \ + Clk_xbar {Auto} \ + Master "/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD" \ + Slave "/myproject_axi_0/s_axi_CTRL_BUS" \ + intc_ip {New AXI Interconnect} \ + master_apm {0}} [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {Auto} \ + Clk_slave "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_xbar "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Master "/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD" \ + Slave "/myproject_axi_0/s_axi_CTRL_BUS" \ + intc_ip {/ps8_0_axi_periph} \ + master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_slave {Auto} \ + Clk_xbar {Auto} \ + Master "/myproject_axi_0/m_axi_IN_BUS" \ + Slave "/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD" \ + intc_ip {Auto} \ + master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_slave "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_xbar "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Master "/myproject_axi_0/m_axi_OUT_BUS" \ + Slave "/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD" \ + intc_ip {/axi_smc} \ + master_apm {0}} [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_slave "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_xbar "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Master "/myproject_axi_0/m_axi_MODEL_BUS" \ + Slave "/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD" \ + intc_ip {/axi_smc} \ + master_apm {0}} [get_bd_intf_pins myproject_axi_0/m_axi_MODEL_BUS] + +# Wiring interrupt signal +connect_bd_net [get_bd_pins ${acc_name}/interrupt] [get_bd_pins ${ps_name}/pl_ps_irq0] + +# Top level wrapper +make_wrapper -files [get_files ./${project_name}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top +add_files -norecurse ./${project_name}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v + +# Memory mapping +delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_HPC0_LPS_OCM] +delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_HPC0_LPS_OCM] + +# Run synthesis and implementation +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +# Reporting +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages + +# Export HDF file for SDK flow +file mkdir ./hdf +file copy -force ${project_name}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef 
./hdf/${design_name}_wrapper.hdf diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 0000000000..4721b59941 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,58 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${myproject}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force + +set_property board_part em.avnet.com:ultra96:part0:1.2 [current_project] +set_property ip_repo_paths ${myproject}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" +set_property ip_repo_paths ${myproject}_prj/solution1/impl/ip [current_project] +update_ip_catalog + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] + +set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] +endgroup + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0 +endgroup +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${myproject}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] 
[get_bd_intf_pins ${myproject}_axi_0/out_r] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${myproject}_axi_0/ap_clk] +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${myproject}_axi_0] + +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index f979b60321..cbcd1d3000 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -17,6 +17,10 @@ def write_axi_wrapper(self, model): inp_axi_t, out_axi_t, inp, out = self.vivado_accelerator_config.get_corrected_types() indent = ' ' + io_type = model.config.get_config_value('IOType') + interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + ####################### ## myproject_axi.h ####################### @@ -32,6 +36,11 @@ def write_axi_wrapper(self, model): newline = '#include "{}.h"\n'.format(model.config.get_project_name()) elif 'void myproject(' in line: newline = 'void {}_axi(\n'.format(model.config.get_project_name()) + elif config_weights and '//hls-fpga-machine-learning insert weights' in line: + newline = '' + for v in model.get_weight_variables(): + newline += indent + ', model_axi_t {name} [{shape}]\n'.format(name=v.name, shape=v.data_length) + newline += ', char load_weights' elif '//hls-fpga-machine-learning insert definitions' in line: newline = '' newline += 'static const unsigned N_IN = {};\n'.format(inp.size()) @@ -66,6 +75,9 @@ def write_axi_wrapper(self, model): else: newline += 'typedef {} input_axi_t;\n'.format(inp_axi_t) newline += 'typedef {} output_axi_t;\n'.format(out_axi_t) + if config_weights: + newline += 'typedef {} model_axi_t; // FIXME: Arbitrary choice type of the inputs and weights are the same\n'\ + .format(inp_axi_t) else: newline = line fout.write(newline) @@ -87,6 +99,11 @@ def write_axi_wrapper(self, model): newline = 'void {}_axi(\n'.format(model.config.get_project_name()) elif '//hls-fpga-machine-learning insert include' in line: newline = '#include "{}_axi.h"\n'.format(model.config.get_project_name()) + elif config_weights and '//hls-fpga-machine-learning insert weights' in line: + newline = '' + for v in model.get_weight_variables(): + newline += indent + ', model_axi_t {name} [{shape}]\n'.format(name=v.name, shape=v.data_length) + newline += indent + ', char load_weights' elif '//hls-fpga-machine-learning insert local vars' in line: newline = '' if self.vivado_accelerator_config.get_interface() == 'axi_stream': @@ -101,9 +118,22 @@ def write_axi_wrapper(self, model): .format(model.get_input_variables()[0].pragma[1]) newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'\ .format(model.get_output_variables()[0].pragma[1]) + if config_weights: + newline += '\n' + for v in model.get_weight_variables(): + newline += indent + 'static {dtype} {name}_local 
[{shape}];\n'.format(dtype=v.type.name, name=v.name, shape=v.data_length) elif '//hls-fpga-machine-learning insert call' in line: - newline = indent + '{}(in_local, out_local);\n'.format( - model.config.get_project_name()) + if config_weights: + newline = '' + weight_string='' + for v in model.get_weight_variables(): + weight_string+=','+v.name+'_local' + newline = indent + indent + '{}(in_local, out_local'.format(model.config.get_project_name()) + newline += weight_string + newline += ');\n' + else: + newline = indent + '{}(in_local, out_local);\n'.format( + model.config.get_project_name()) elif '//hls-fpga-machine-learning insert interface' in line: if self.vivado_accelerator_config.get_interface() == 'axi_lite': newline = '' @@ -117,6 +147,12 @@ def write_axi_wrapper(self, model): .format(model.get_input_variables()[0].pragma[1]) newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'\ .format(model.get_output_variables()[0].pragma[1]) + if config_weights: + newline += '\n' + for v in model.get_weight_variables(): + newline += indent + '#pragma HLS INTERFACE m_axi depth=1 port={} offset=slave bundle=MODEL_BUS\n'\ + .format(v.name) + newline += indent + '#pragma HLS INTERFACE s_axilite port=load_weights bundle=CTRL_BUS\n' elif self.vivado_accelerator_config.get_interface() == 'axi_stream': newline = '' newline += indent + '#pragma HLS INTERFACE axis port=in\n' @@ -124,6 +160,15 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' if model.config.get_config_value("IOType") == 'io_stream': newline += indent + '#pragma HLS DATAFLOW\n' + elif config_weights and '//hls-fpga-machine-learning insert enqueue weights' in line: + newline = '' + newline += indent + 'if (load_weights) {' + for v in model.get_weight_variables(): + newline += indent + indent + 'for (unsigned i = 0; i < {shape}; i++)\n'\ + .format(shape=v.data_length) + newline += indent + indent + indent + '{name}_local[i] = {name}[i];\n'\ + .format(dtype=v.type.name, name=v.name, shape=v.data_length) + newline += indent + '} else {' elif '//hls-fpga-machine-learning insert enqueue' in line: io_type = model.config.get_config_value("IOType") if io_type == 'io_parallel': @@ -138,6 +183,8 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'in_local[i] = in[i]; // Read input with cast\n' newline += indent + '}\n' elif io_type == 'io_stream': + if config_weights: + indent = indent + indent newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' @@ -154,6 +201,8 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'in_local.write(ctype);\n' newline += indent + '}}\n' newline = newline.format(input_t=inp.type.name) + if config_weights: + indent = ' ' elif '//hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") if io_type == 'io_parallel': @@ -168,6 +217,8 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n' newline += indent + '}\n' elif io_type == 'io_stream': + if config_weights: + indent = indent + indent newline = '' newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' @@ -182,6 +233,9 @@ def write_axi_wrapper(self, model): newline += indent + indent + '}}\n' newline += indent + '}}\n' newline = 
newline.format(result_t=out.type.name) + if config_weights: + indent = ' ' + newline += indent + '}' else: newline = line fout.write(newline) @@ -232,6 +286,10 @@ def modify_build_script(self, model): def write_wrapper_test(self, model): + io_type = model.config.get_config_value('IOType') + interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + ################### # write myproject_test_wrapper.cpp ################### @@ -256,7 +314,11 @@ def write_wrapper_test(self, model): newline = '' elif '{}('.format(model.config.get_project_name()) in line: indent_amount = line.split(model.config.get_project_name())[0] - newline = indent_amount + '{}_axi(inputs,outputs);\n'.format(model.config.get_project_name()) + if config_weights: + newline = line.replace(model.config.get_project_name(),model.config.get_project_name()+'_axi').\ + replace(inp.name,'inputs').replace(out.name,'outputs') + else: + newline = indent_amount + '{}_axi(inputs,outputs);\n'.format(model.config.get_project_name()) elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'input_axi_t') @@ -288,6 +350,7 @@ def write_wrapper_test(self, model): inp = model.get_input_variables()[0] out = model.get_output_variables()[0] + brams = model.get_weight_variables() for line in f.readlines(): if '{}.h'.format(model.config.get_project_name()) in line: @@ -301,8 +364,13 @@ def write_wrapper_test(self, model): 'output_axi_t {}_ap[N_OUT]'.format(out.name)) elif '{}('.format(model.config.get_project_name()) in line: indent_amount = line.split(model.config.get_project_name())[0] - newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format(model.config.get_project_name(), inp.name, - out.name) + if config_weights: + newline = line.replace(model.config.get_project_name(),model.config.get_project_name()+'_axi') + for b in brams: + newline = newline.replace(b.name, b.name+'_ap') + else: + newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format(model.config.get_project_name(), inp.name, + out.name) elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'input_axi_t') elif out.size_cpp() in line or out.name in line or out.type.name in line: @@ -315,6 +383,71 @@ def write_wrapper_test(self, model): fout.close() os.rename(newfile, oldfile) + def modify_python_driver(self, model): + + io_type = model.config.get_config_value('IOType') + interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None + driver = model.config.get_config_value('AcceleratorConfig')['Driver'] if model.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + + if driver == 'c' or io_type != 'io_stream' or interface != 'axi_master': + return; + + ################### + # write axi_master_driver.py + ################### + oldfile = '{}/axi_master_driver.py'.format(model.config.get_output_dir(), model.config.get_project_name()) + newfile = '{}/axi_master_driveri.NEW.py'.format(model.config.get_output_dir(), model.config.get_project_name()) + + f = open(oldfile, 'r') + fout = open(newfile, 'w') + + indent = ' ' + brams = model.get_weight_variables() + + for line in 
f.readlines(): + if '#hls-fpga-machine-learning insert init' in line: + newline = line + w_shapes = '' + for b in brams: + w_shapes += b.name + '_shape, ' + newline += indent + 'def __init__(self, bitfile_name, x_shape, y_shape, {}dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None):\n'\ + .format(w_shapes) + elif '#hls-fpga-machine-learning insert registers' in line: + newline = line + for b in brams: + newline += indent + indent + 'self.reg{} = self.myproject_axi_0.register_map.{}.address\n'.format(b.name, b.name) + elif '#hls-fpga-machine-learning insert buffers' in line: + newline = line + for b in brams: + newline += indent + indent + 'self.{}_buffer = allocate(shape={}_shape, dtype=dtype)\n'.format(b.name, b.name) + elif '#hls-fpga-machine-learning insert load weights' in line: + newline = line + weights = '' + for b in brams: + weights += b.name + ', ' + newline = 'def load_weights(self, {}debug=False, profile=False, encode=None):\n'.format(weights) + elif '#hls-fpga-machine-learning insert encode' in line: + newline = line + for b in brams: + newline += indent + indent + '{} = encode({})\n'.format(b.name, b.name) + elif '#hls-fpga-machine-learning insert set buffers' in line: + newline = line + for b in brams: + newline += indent + indent + 'self.{}_buffer[:] = {}\n'.format(b.name, b.name) + elif '#hls-fpga-machine-learning insert set registers' in line: + newline = line + for b in brams: + newline += indent + indent + 'self.myproject_axi_0.write(self.reg{}, self.{}_buffer.physical_address)\n'.format(b.name, b.name) + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + def write_board_script(self, model): ''' Write the tcl scripts and kernel sources to create a Vivado IPI project for the VivadoAccelerator @@ -348,14 +481,133 @@ def write_board_script(self, model): def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()), - ('{}/' + self.vivado_accelerator_config.get_driver_file()).format(model.config.get_output_dir())) - + srcfiles = os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()) + dstfiles = ('{}/' + self.vivado_accelerator_config.get_driver_files()).format(model.config.get_output_dir()) + if os.path.isdir(srcfiles): + copytree(srcfiles, dstfiles, dirs_exist_ok=True) + else: + copyfile(srcfiles, dstfiles) + def write_new_tar(self, model): os.remove(model.config.get_output_dir() + '.tar.gz') super(VivadoAcceleratorWriter, self).write_tar(model) - + def write_standalone_app(self, model): + + indent = ' ' + + weights = model.get_weight_variables() + + io_type = model.config.get_config_value('IOType') + interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None + driver = model.config.get_config_value('AcceleratorConfig')['Driver'] if model.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + + if driver == 'python': + return; + + ####################### + ## main.c + ####################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/vivado_accelerator/standalone_main.c'), 'r') + fout = open('{}/sdk/common/main.c'.format(model.config.get_output_dir()), 'w') + + for line in f.readlines(): + + if config_weights and '/*hls-fpga-machine-learning insert 
configure weights*/' in line: + newline = line + for w in weights: + newline += indent + 'XMyproject_axi_Set_{name}(&accelerator, {name}); /* TODO: design-dependent name */\n'.format(name=w.name) + elif config_weights and '/*hls-fpga-machine-learning insert load weights on*/' in line: + newline = line + newline += indent + 'XMyproject_axi_Set_load_weights(&accelerator, 1); /* TODO: design-dependent name */\n' + elif config_weights and '/*hls-fpga-machine-learning insert load weights off*/' in line: + newline = line + newline += indent + indent + 'XMyproject_axi_Set_load_weights(&accelerator, 0); /* TODO: design-dependent name */\n' + elif config_weights and '/*hls-fpga-machine-learning insert start and wait*/' in line: + newline = line + newline += indent + 'XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */\n' + newline += indent + 'while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */\n' + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + + def write_header_file(model, X, y, y_keras, y_hls, n_samples, filename='data.h'): + #TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package + from hls4ml.backends import VivadoAcceleratorConfig + vivado_accelerator_config = VivadoAcceleratorConfig(model.config, model.get_input_variables(), + model.get_output_variables()) + inp_axi_t, out_axi_t, inp, out = vivado_accelerator_config.get_corrected_types() + header_file = open(filename, 'w') + (n_X_samples, n_X_inputs) = X.shape + (n_y_samples, n_y_outputs) = y.shape + (n_y_keras_samples, n_y_keras_outputs) = y_keras.shape + (n_y_hls_samples, n_y_hls_outputs) = y_hls.shape + + header_file.write('#ifndef __DATA_H__\n') + header_file.write('#define __DATA_H__\n') + header_file.write('/* out of {} */\n'.format(n_X_samples)) + header_file.write('#define N_SAMPLES {}\n'.format(n_samples)) + header_file.write('\n') + + import numpy as np + for layer in model.get_layers(): + for weights in layer.get_weights(): + header_file.write('#define N_{name} {size}\n'.format(name=weights.name.upper(), size=np.prod(weights.shape))) + header_file.write('const {dtype} {name}[N_{uname}] = {{\n'.format(dtype=inp_axi_t, name=weights.name, uname=weights.name.upper())) + for w in weights: + header_file.write(w + ',') + header_file.write('};\n\n') + + header_file.write('#define N_X_INPUTS {}\n'.format(n_X_inputs)) + header_file.write('const {} data_X_inputs[N_SAMPLES*N_X_INPUTS] = {{\n'.format(inp_axi_t)) + for s in range(n_samples): + header_file.write(' ') + for i in range(n_X_inputs): + header_file.write('{}, '.format(X[s][i])) + header_file.write('\n') + header_file.write('};\n') + header_file.write('\n') + header_file.write('/* Ground truth - for validation */\n') + header_file.write('#define N_Y_OUTPUTS {}\n'.format(n_y_outputs)) + header_file.write('const float data_y_outputs[N_SAMPLES*N_Y_OUTPUTS] = {\n') + for s in range(n_samples): + header_file.write(' ') + for o in range(n_y_outputs): + header_file.write('{}, '.format(y[s][o])) + header_file.write('\n') + header_file.write('};\n') + header_file.write('\n') + header_file.write('/* Keras outputs - for validation */\n') + header_file.write('#define N_Y_KERAS_OUTPUTS {}\n'.format(n_y_keras_outputs)) + header_file.write('') + header_file.write('const float data_y_keras_outputs[N_SAMPLES*N_Y_KERAS_OUTPUTS] = {\n') + for s in range(n_samples): + header_file.write(' ') + for o in range(n_y_keras_outputs): + header_file.write('{}, '.format(y_keras[s][o])) 
     def write_hls(self, model):
         """
         Write the HLS project. Calls the VivadoBackend writer, and extra steps for VivadoAccelerator/AXI interface
@@ -369,6 +621,8 @@ def write_hls(self, model):
         self.write_driver(model)
         self.write_wrapper_test(model)
         self.write_axi_wrapper(model)
+        self.write_standalone_app(model)
+        self.modify_python_driver(model)
         self.modify_build_script(model)
         self.write_new_tar(model)

diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py
index af1f1b61ba..8494b49fb4 100644
--- a/hls4ml/writer/vivado_writer.py
+++ b/hls4ml/writer/vivado_writer.py
@@ -109,6 +109,10 @@ def write_project_cpp(self, model):
         model_outputs = model.get_output_variables()
         model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram']

+        io_type = model.config.get_config_value('IOType')
+        interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None
+        config_weights = (io_type == 'io_stream') and (interface == 'axi_master')
+
         indent = '    '

         for line in f.readlines():
@@ -159,7 +163,11 @@ def write_project_cpp(self, model):
                 if io_type == 'io_stream':
                     newline += indent + '#pragma HLS INTERFACE axis port={},{} \n'.format(','.join(all_inputs), ','.join(all_outputs))
                     if all_brams:
-                        newline += indent + '#pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams))
+                        if config_weights:
+                            newline += indent + '//#pragma HLS INTERFACE bram port={} // Disabled (it fails on FPGA otherwise)\n'\
+                                .format(','.join(all_brams))
+                        else:
+                            newline += indent + '#pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams))
                     newline += indent + '#pragma HLS DATAFLOW \n'

             elif '//hls-fpga-machine-learning insert layers' in line:
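The three-line IOType/Interface check introduced above in write_project_cpp reappears verbatim in write_test_bench and write_bridge below. A small shared helper could express the intent once; the function name and its placement are assumptions, not something this patch adds.

# Hedged refactoring sketch: one place for the repeated io_stream + axi_master check.
# The name weights_over_axi_master is an assumption, not part of this patch.
def weights_over_axi_master(config):
    """Return True when weights are expected to be streamed in over the AXI-master interface."""
    io_type = config.get_config_value('IOType')
    accel_cfg = config.get_config_value('AcceleratorConfig')
    interface = accel_cfg['Interface'] if accel_cfg else None
    return io_type == 'io_stream' and interface == 'axi_master'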
@@ -337,6 +345,10 @@ def write_test_bench(self, model):
         ## test bench
         ###################

+        io_type = model.config.get_config_value('IOType')
+        interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None
+        config_weights = (io_type == 'io_stream') and (interface == 'axi_master')
+
         filedir = os.path.dirname(os.path.abspath(__file__))

         if not os.path.exists('{}/tb_data/'.format(model.config.get_output_dir())):
@@ -362,7 +374,7 @@ def write_test_bench(self, model):
         model_inputs = model.get_input_variables()
         model_outputs = model.get_output_variables()
-        model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram']
+        model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram' or config_weights]

         for line in f.readlines():
             indent = ' ' * (len(line) - len(line.lstrip(' ')))

             #Insert numbers
             if 'myproject' in line:
                 newline = line.replace('myproject', model.config.get_project_name())
-            elif '//hls-fpga-machine-learning insert bram' in line:
+            elif (not config_weights) and '//hls-fpga-machine-learning insert bram' in line:
                 newline = line
                 for bram in model_brams:
                     newline += '#include \"firmware/weights/{}.h\"\n'.format(bram.name)
+            elif config_weights and '//hls-fpga-machine-learning insert weights' in line:
+                newline = line
+                for v in model.get_weight_variables():
+                    newline += indent + 'model_axi_t {name}[{shape}];\n'.format(name=v.name, shape=v.data_length)
+            elif config_weights and '//hls-fpga-machine-learning insert load weights' in line:
+                newline = line
+                for v in model.get_weight_variables():
+                    newline += indent + 'nnet::load_weights_from_txt<model_axi_t, {shape}>({name}, "{name}.txt");\n'\
+                        .format(name=v.name, shape=v.data_length)
             elif '//hls-fpga-machine-learning insert data' in line:
                 newline = line
                 offset = 0
@@ -400,7 +421,12 @@ def write_test_bench(self, model):
                 # Concatenate the input, output, and bram variables. Filter out empty/null values
                 all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars]))

-                top_level = indent + '{}({});\n'.format(model.config.get_project_name(), all_vars)
+                if config_weights:
+                    top_level = indent + '{}({},/*load_weights*/true);\n'.format(model.config.get_project_name(), all_vars)
+                    newline += top_level
+                    top_level = indent + '{}({},/*load_weights*/false);\n'.format(model.config.get_project_name(), all_vars)
+                else:
+                    top_level = indent + '{}({});\n'.format(model.config.get_project_name(), all_vars)
                 newline += top_level

             elif '//hls-fpga-machine-learning insert predictions' in line:
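When config_weights is set, the generated test bench calls the top-level function twice, presumably once with /*load_weights*/true to stream the weights in and once with /*load_weights*/false for the actual inference; write_bridge below repeats the same pattern. A sketch of a helper that captures it (the function name is an assumption, not part of this patch):

# Hedged sketch: build the generated call(s) to the top-level HLS function.
def emit_top_level_calls(project_name, all_vars, config_weights, indent='    '):
    if config_weights:
        # Two calls: first to load the weights, then to run inference (assumed intent).
        return (indent + '{}({},/*load_weights*/true);\n'.format(project_name, all_vars) +
                indent + '{}({},/*load_weights*/false);\n'.format(project_name, all_vars))
    return indent + '{}({});\n'.format(project_name, all_vars)

# Example: emit_top_level_calls('myproject', 'inputs,outputs,w2,b2', True)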
@@ -429,6 +455,10 @@ def write_bridge(self, model):
         # c++-python bridge
         ###################

+        io_type = model.config.get_config_value('IOType')
+        interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None
+        config_weights = (io_type == 'io_stream') and (interface == 'axi_master')
+
         filedir = os.path.dirname(os.path.abspath(__file__))
         f = open(os.path.join(filedir,'../templates/vivado/myproject_bridge.cpp'),'r')
         fout = open('{}/{}_bridge.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()),'w')
@@ -455,8 +485,11 @@ def write_bridge(self, model):
             outputs_str = ', '.join(['{type} {name}[{shape}]'.format(type=dtype, name=o.name, shape=o.size_cpp()) for o in model_outputs])

             newline = ''
-            newline += indent + inputs_str + ',\n'
-            newline += indent + outputs_str + '\n'
+            newline += indent + inputs_str + '\n'
+            newline += indent + ', ' + outputs_str + '\n'
+            if config_weights:
+                for v in model.get_weight_variables():
+                    newline += indent + ', {type} {name} [{shape}]\n'.format(type=dtype, name=v.name, shape=v.data_length)

         elif '//hls-fpga-machine-learning insert wrapper' in line:
             dtype = line.split('#', 1)[1].strip()
             newline = ''
@@ -465,6 +498,12 @@ def write_bridge(self, model):
                 newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format(dtype, i.type.name, i.size_cpp(), i.name, i.name)
             newline += '\n'
+            if config_weights:
+                for b in model_brams:
+                    newline += indent + 'model_axi_t {name}_ap[{shape}];\n'.format(name=b.name, shape=b.data_length)
+                    newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format(dtype, 'model_axi_t', b.data_length, b.name, b.name)
+                newline += '\n'
+
             for o in model_outputs:
                 newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap'))
@@ -476,8 +515,11 @@ def write_bridge(self, model):
             # Concatenate the input, output, and bram variables. Filter out empty/null values
             all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars]))
-
-            top_level = indent + '{}({});\n'.format(model.config.get_project_name(), all_vars)
+            if config_weights:
+                top_level = indent + '{}({},/*load_weights*/true);\n'.format(model.config.get_project_name(), all_vars)
+                top_level += indent + '{}({},/*load_weights*/false);\n'.format(model.config.get_project_name(), all_vars)
+            else:
+                top_level = indent + '{}({});\n'.format(model.config.get_project_name(), all_vars)
             newline += top_level

             newline += '\n'
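Finally, a hedged end-to-end sketch of the configuration that exercises these io_stream plus axi_master code paths. The parameter names follow the VivadoAccelerator backend's create_initial_config; the model file, board choice, output directory and input shape are assumptions, not part of this patch.

# Hedged usage sketch (not part of the patch): convert a Keras model with the
# AXI-master interface so the weight-loading wrappers above are generated.
import numpy as np
import hls4ml
from tensorflow.keras.models import load_model

keras_model = load_model('model.h5')                            # assumed model file
config = hls4ml.utils.config_from_keras_model(keras_model, granularity='model')

hls_model = hls4ml.converters.convert_from_keras_model(
    keras_model,
    hls_config=config,
    backend='VivadoAccelerator',
    io_type='io_stream',        # required for the axi_master weight path
    interface='axi_master',     # selects the /*load_weights*/ wrappers emitted above
    driver='c',                 # request the standalone C application/SDK driver
    board='pynq-z2',            # any board with an axi_master design script (assumption)
    output_dir='my-hls-test',
)
hls_model.compile()

X = np.random.rand(10, 16).astype(np.float32)                   # toy input (shape is an assumption)
y_hls = hls_model.predict(X)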