diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json
index 1279ec22d0..08b4485bf0 100644
--- a/hls4ml/backends/vivado_accelerator/supported_boards.json
+++ b/hls4ml/backends/vivado_accelerator/supported_boards.json
@@ -1,8 +1,8 @@
 {
     "pynq-z2": {
         "part": "xc7z020clg400-1",
-        "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+        "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"},
+        "python_drivers": {"axi_stream": "axi_stream_driver.py", "axi_master": "axi_master_driver.py"},
         "c_drivers": {}
     },
     "zcu102": {
diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp
index 7a06633e58..f3490739aa 100644
--- a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp
+++ b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp
@@ -9,6 +9,8 @@ void myproject(
     //hls-fpga-machine-learning insert local vars

+    //hls-fpga-machine-learning insert load weights
+
     //hls-fpga-machine-learning insert enqueue

     //hls-fpga-machine-learning insert call

diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/python_drivers/axi_master_driver.py b/hls4ml/templates/vivado_accelerator/pynq-z2/python_drivers/axi_master_driver.py
new file mode 100644
index 0000000000..c0a197e91e
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/python_drivers/axi_master_driver.py
@@ -0,0 +1,76 @@
+from pynq import DefaultHierarchy, DefaultIP, allocate
+from pynq import Overlay
+from datetime import datetime
+import pynq.lib.dma
+import numpy as np
+
+
+class NeuralNetworkOverlay(Overlay):
+    def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False,
+                 device=None):
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+        self.regin = self.myproject_axi_0.register_map.in_r.address
+        self.regout = self.myproject_axi_0.register_map.out_r.address
+        self.ctrl = self.myproject_axi_0.register_map.CTRL
+        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype)
+
+    def _print_dt(self, timea, timeb, N):
+        dt = (timeb - timea)
+        dts = dt.seconds + dt.microseconds * 10 ** -6
+        rate = N / dts
+        print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate))
+        return dts, rate
+
+    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be a numpy ndarray.
+        - dtype : the data type of the elements of the input/output vectors.
+                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+                  types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+                  Instead, if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot be
+                  any integer value, but it can assume {..., 8, 16, 32, ...} values. Check the `numpy`
+                  doc for more info).
+                  In this case the encoding/decoding has to be computed by the PS. For example, for the
+                  'ap_fixed<16,6>' type the following two functions are the correct ones to use to encode/decode
+                  'float' -> 'ap_fixed<16,6>':
+                  ```
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10))  # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode)  # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+                  ```
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of `inferences/s`.
+        - encode/decode : function pointers. See the `dtype` section for more information.
+        - return : an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
+                  the namesake parameter.
+        """
+        if profile:
+            timea = datetime.now()
+        if encode is not None:
+            X = encode(X)
+        self.input_buffer[:] = X
+        self.myproject_axi_0.write(self.regin, self.input_buffer.physical_address)
+        self.myproject_axi_0.write(self.regout, self.output_buffer.physical_address)
+        self.myproject_axi_0.write(self.ctrl.AP_START, 0x1)
+        if debug:
+            print("Config OK")
+        while not self.ctrl.AP_DONE:
+            if debug:
+                print("Polling...")
+        if debug:
+            print("Done OK")
+        # result = self.output_buffer.copy()
+        if decode is not None:
+            self.output_buffer = decode(self.output_buffer)
+
+        if profile:
+            timeb = datetime.now()
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return self.output_buffer, dts, rate
+        else:
+            return self.output_buffer
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl
new file mode 100644
index 0000000000..2dc70f81b7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl
@@ -0,0 +1,93 @@
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+# Project names
+set design_name "design_1"
+set hls_solution_name "solution1"
+set ps_name "processing_system7_0"
+set acc_name "${project_name}_axi_0"
+
+# Board and chip part names
+create_project ${project_name} ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force
+set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
+
+# Create block design
+create_bd_design ${design_name}
+
+# Setup IP repo
+#set_property ip_repo_paths ${project_name}_prj [current_project]
+set_property ip_repo_paths ${project_name}_prj/${hls_solution_name}/impl/ip [current_project]
+update_ip_catalog
+
+# Create and setup PS
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 ${ps_name}
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config " \
+    make_external {FIXED_IO, DDR} \
+    apply_board_preset {1} \
+    Master {Disable} \
+    Slave {Disable} " [get_bd_cells ${ps_name}]
+set_property -dict [list \
+    CONFIG.PCW_USE_S_AXI_GP0 {1} \
+    CONFIG.PCW_USE_FABRIC_INTERRUPT {1} \
+    CONFIG.PCW_IRQ_F2P_INTR {1}\
+    ] [get_bd_cells ${ps_name}]
+
+# Create accelerator
+create_bd_cell -type ip -vlnv xilinx.com:hls:myproject_axi:1.0 ${acc_name}
+
+# Wiring
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config " \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master /${ps_name}/M_AXI_GP0 \
+    Slave /${acc_name}/s_axi_CTRL_BUS \
+    intc_ip {New AXI Interconnect} \
+    master_apm {0}" [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config " \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master /${acc_name}/m_axi_IN_BUS \
+    Slave /${ps_name}/S_AXI_GP0 \
+    intc_ip {Auto} \
+    master_apm {0}" [get_bd_intf_pins ${ps_name}/S_AXI_GP0]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config " \
+    Clk_master /${ps_name}/FCLK_CLK0 (100 MHz) \
+    Clk_slave /${ps_name}/FCLK_CLK0 (100 MHz) \
+    Clk_xbar /${ps_name}/FCLK_CLK0 (100 MHz) \
+    Master /${acc_name}/m_axi_OUT_BUS \
+    Slave /${ps_name}/S_AXI_GP0 \
+    intc_ip {/axi_smc} \
+    master_apm {0}" [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS]
+
+# Wiring interrupt signal
+connect_bd_net [get_bd_pins ${acc_name}/interrupt] [get_bd_pins ${ps_name}/IRQ_F2P]
+
+# Top level wrapper
+make_wrapper -files [get_files ./${project_name}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top
+add_files -norecurse ./${project_name}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v
+
+# Memory mapping
+delete_bd_objs [get_bd_addr_segs ${project_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_GP0_QSPI_LINEAR]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_GP0_IOP]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_GP0_M_AXI_GP0]
+delete_bd_objs [get_bd_addr_segs ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_GP0_QSPI_LINEAR]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_GP0_IOP]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_GP0_M_AXI_GP0]
+
+# Run synthesis and implementation
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+# Reporting
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
+
+# Export HDF file for SDK flow
+file mkdir ./hdf
+file copy -force ${project_name}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf
diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py
index f979b60321..15b50fd2dd 100644
--- a/hls4ml/writer/vivado_accelerator_writer.py
+++ b/hls4ml/writer/vivado_accelerator_writer.py
@@ -82,11 +82,18 @@ def write_axi_wrapper(self, model):

         io_type = model.config.get_config_value("IOType")

+        model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram']
+
         for line in f.readlines():
             if 'void myproject(' in line:
                 newline = 'void {}_axi(\n'.format(model.config.get_project_name())
             elif '//hls-fpga-machine-learning insert include' in line:
                 newline = '#include "{}_axi.h"\n'.format(model.config.get_project_name())
+                for b in model_brams:
+                    newline += '#include "weights/{}.h"\n'.format(b.name)
+                newline += '\n'
+                if model_brams:
+                    newline += '#include "nnet_utils/nnet_helpers.h"\n'
             elif '//hls-fpga-machine-learning insert local vars' in line:
                 newline = ''
                 if self.vivado_accelerator_config.get_interface() == 'axi_stream':
@@ -102,8 +109,8 @@ def write_axi_wrapper(self, model):
                     newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'\
                         .format(model.get_output_variables()[0].pragma[1])
             elif '//hls-fpga-machine-learning insert call' in line:
-                newline = indent + '{}(in_local, out_local);\n'.format(
-                    model.config.get_project_name())
+                brams_str = (''.join([', ' + b.name for b in model_brams])) if len(model_brams) > 0 else ""
+                newline = indent + '{}(in_local, out_local{});\n'.format(model.config.get_project_name(), brams_str)
             elif '//hls-fpga-machine-learning insert interface' in line:
                 if self.vivado_accelerator_config.get_interface() == 'axi_lite':
                     newline = ''
@@ -124,6 +131,23 @@ def write_axi_wrapper(self, model):
                     newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n'
                     if model.config.get_config_value("IOType") == 'io_stream':
                         newline += indent + '#pragma HLS DATAFLOW\n'
+            elif '//hls-fpga-machine-learning insert load weights' in line:
+                newline = ''
+                if model_brams:
+                    newline += '#ifndef __SYNTHESIS__\n'
+                    newline += indent + 'static bool loaded_weights = false;\n'
+                    newline += indent + 'if (!loaded_weights) {\n'
+                    newline += indent + '    loaded_weights = true;\n'
+                    for layer in model.get_layers():
+                        for w in layer.get_weights():
+                            if w.weight_class == 'CompressedWeightVariable':
+                                newline += indent + '    nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format(w.type.name, w.nonzeros, w.name, w.name)
+                            elif w.weight_class == 'ExponentWeightVariable':
+                                newline += indent + '    nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format(w.type.name, w.data_length, w.name, w.name)
+                            else:
+                                newline += indent + '    nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format(w.type.name, w.data_length, w.name, w.name)
+                    newline += indent + '}\n'
+                    newline += '#endif\n'
             elif '//hls-fpga-machine-learning insert enqueue' in line:
                 io_type = model.config.get_config_value("IOType")
                 if io_type == 'io_parallel':
@@ -139,10 +163,12 @@ def write_axi_wrapper(self, model):
                     newline += indent + '}\n'
                 elif io_type == 'io_stream':
                     newline = ''
+                    newline += 'LOAD_INPUT_OUTER_LOOP:\n'
                     newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n'
                     # newline += indent + indent + '#pragma HLS PIPELINE\n'
                     newline += indent + indent + '{input_t} ctype;\n'
                     newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n'
+                    newline += 'LOAD_INPUT_INNER_LOOP:\n'
                     newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n'
                     # newline += indent + indent + indent + '#pragma HLS UNROLL\n'
                     if self.vivado_accelerator_config.get_interface() == 'axi_stream':
@@ -169,9 +195,11 @@ def write_axi_wrapper(self, model):
                     newline += indent + '}\n'
                 elif io_type == 'io_stream':
                     newline = ''
+                    newline += 'STORE_OUTPUT_OUTER_LOOP:\n'
                     newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n'
                     # newline += indent + indent + '#pragma HLS PIPELINE\n'
                     newline += indent + indent + '{result_t} ctype = out_local.read();\n'
+                    newline += 'STORE_OUTPUT_INNER_LOOP:\n'
                     newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n'
                     # newline += indent + indent + indent + '#pragma HLS UNROLL\n'
                     if self.vivado_accelerator_config.get_interface() == 'axi_stream':
@@ -188,6 +216,35 @@ def write_axi_wrapper(self, model):
         f.close()
         fout.close()

+    def modify_project_cpp(self, model):
+        '''
+        Strip the HLS interface pragmas and weight-loading calls from the generated project .cpp, since these are now handled by the AXI wrapper
+        '''
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        oldfile = '{}/firmware/{}.cpp'.format(model.config.get_output_dir(), model.config.get_project_name())
+        newfile = '{}/build_prj_axi.tcl'.format(model.config.get_output_dir())
+        f = open(oldfile, 'r')
+        fout = open(newfile, 'w')
+
+        for line in f.readlines():
+            if '#pragma HLS INTERFACE axis port=' in line:
+                newline = ''
+            elif '#pragma HLS INTERFACE bram port=' in line:
+                newline = ''
+            elif 'nnet::load_weights_from_txt' in line:
+                newline = ''
+            elif 'nnet::load_exponent_weights_from_txt' in line:
+                newline = ''
+            elif 'nnet::load_compressed_weights_from_txt' in line:
+                newline = ''
+            else:
+                newline = line
+            fout.write(newline)
+
+        f.close()
+        fout.close()
+        os.rename(newfile, oldfile)
+
     def modify_build_script(self, model):
         '''
         Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function
@@ -369,6 +426,7 @@ def write_hls(self, model):
         self.write_driver(model)
         self.write_wrapper_test(model)
         self.write_axi_wrapper(model)
+        self.modify_project_cpp(model)
         self.modify_build_script(model)
         self.write_new_tar(model)

diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py
index bcf752b835..7ee61b8030 100644
--- a/hls4ml/writer/vivado_writer.py
+++ b/hls4ml/writer/vivado_writer.py
@@ -40,10 +40,10 @@ def print_array_to_cpp(self, var, odir, write_txt_file=True):

         if write_txt_file:
             h_file.write("#ifndef __SYNTHESIS__\n")
-            h_file.write(var.definition_cpp() + ";\n")
+            h_file.write("static " + var.definition_cpp() + ";\n")
             h_file.write("#else\n")
-            h_file.write(var.definition_cpp() + " = {")
+            h_file.write("static " + var.definition_cpp() + " = {")

         # fill c++ array.
         # not including internal brackets for multidimensional case
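For context, here is a minimal usage sketch (not part of the patch) showing how the new `axi_master` PYNQ driver above is meant to be driven from the PS on a PYNQ-Z2. The bitstream name, the I/O shapes and the `ap_fixed<16,6>` encoding are illustrative assumptions, and the accelerator is assumed to be exposed as `myproject_axi_0`, exactly as in the driver template.

```python
import numpy as np
from axi_master_driver import NeuralNetworkOverlay  # driver file copied to the board

# Hypothetical bitstream name and I/O shapes -- substitute those of your project.
nn = NeuralNetworkOverlay('my_bitfile.bit', x_shape=(16,), y_shape=(5,), dtype=np.int16)

# For an ap_fixed<16,6> interface the PS performs the fixed-point conversion itself,
# using the 2**(16-6) = 2**10 scale factor described in the predict() docstring.
encode_v = np.vectorize(lambda xi: np.int16(round(xi * 2**10)))
decode_v = np.vectorize(lambda yi: yi * 2**-10)

X = np.random.rand(16).astype(np.float32)  # one dummy input vector
y, latency_s, rate = nn.predict(X, profile=True, encode=encode_v, decode=decode_v)
```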