diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json
index 1279ec22d0..08b4485bf0 100644
--- a/hls4ml/backends/vivado_accelerator/supported_boards.json
+++ b/hls4ml/backends/vivado_accelerator/supported_boards.json
@@ -1,8 +1,8 @@
 {
     "pynq-z2": {
         "part": "xc7z020clg400-1",
-        "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+        "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"},
+        "python_drivers": {"axi_stream": "axi_stream_driver.py", "axi_master": "axi_master_driver.py"},
         "c_drivers": {}
     },
     "zcu102": {
diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp
index 7a06633e58..f3490739aa 100644
--- a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp
+++ b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp
@@ -9,6 +9,8 @@ void myproject(
     //hls-fpga-machine-learning insert local vars

+    //hls-fpga-machine-learning insert load weights
+
     //hls-fpga-machine-learning insert enqueue

     //hls-fpga-machine-learning insert call

diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/python_drivers/axi_master_driver.py b/hls4ml/templates/vivado_accelerator/pynq-z2/python_drivers/axi_master_driver.py
new file mode 100644
index 0000000000..c0a197e91e
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/python_drivers/axi_master_driver.py
@@ -0,0 +1,76 @@
+from pynq import DefaultHierarchy, DefaultIP, allocate
+from pynq import Overlay
+from datetime import datetime
+import pynq.lib.dma
+import numpy as np
+
+
+class NeuralNetworkOverlay(Overlay):
+    def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False,
+                 device=None):
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+        self.regin = self.myproject_axi_0.register_map.in_r.address
+        self.regout = self.myproject_axi_0.register_map.out_r.address
+        self.ctrl = self.myproject_axi_0.register_map.CTRL
+        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype)
+
+    def _print_dt(self, timea, timeb, N):
+        dt = (timeb - timea)
+        dts = dt.seconds + dt.microseconds * 10 ** -6
+        rate = N / dts
+        print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate))
+        return dts, rate
+
+    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be a numpy ndarray.
+        - dtype : the data type of the elements of the input/output vectors.
+                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+                  types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+                  Instead, if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot be
+                  any integer value, but it can assume {..., 8, 16, 32, ...} values. Check the `numpy`
+                  doc for more info).
+                  In this case the encoding/decoding has to be computed by the PS. For example, for the
+                  'ap_fixed<16,6>' type the following two functions are the correct ones to use to encode/decode
+                  'float' -> 'ap_fixed<16,6>':
+                  ```
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10))  # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode)  # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+                  ```
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of `inferences/s`.
+        - encode/decode : function pointers. See the `dtype` section for more information.
+        - return : an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
+                  the namesake parameter.
+        """
+        if profile:
+            timea = datetime.now()
+        if encode is not None:
+            X = encode(X)
+        self.input_buffer[:] = X
+        self.myproject_axi_0.write(self.regin, self.input_buffer.physical_address)
+        self.myproject_axi_0.write(self.regout, self.output_buffer.physical_address)
+        self.myproject_axi_0.write(self.ctrl.AP_START, 0x1)
+        if debug:
+            print("Config OK")
+        while not self.ctrl.AP_DONE:
+            if debug:
+                print("Polling...")
+        if debug:
+            print("Done OK")
+        # result = self.output_buffer.copy()
+        if decode is not None:
+            self.output_buffer = decode(self.output_buffer)
+
+        if profile:
+            timeb = datetime.now()
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return self.output_buffer, dts, rate
+        else:
+            return self.output_buffer
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl
new file mode 100644
index 0000000000..2dc70f81b7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl
@@ -0,0 +1,93 @@
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+# Project names
+set design_name "design_1"
+set hls_solution_name "solution1"
+set ps_name "processing_system7_0"
+set acc_name "${project_name}_axi_0"
+
+# Board and chip part names
+create_project ${project_name} ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force
+set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
+
+# Create block design
+create_bd_design ${design_name}
+
+# Setup IP repo
+#set_property ip_repo_paths ${project_name}_prj [current_project]
+set_property ip_repo_paths ${project_name}_prj/${hls_solution_name}/impl/ip [current_project]
+update_ip_catalog
+
+# Create and setup PS
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 ${ps_name}
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config " \
+    make_external {FIXED_IO, DDR} \
+    apply_board_preset {1} \
+    Master {Disable} \
+    Slave {Disable} " [get_bd_cells ${ps_name}]
+set_property -dict [list \
+    CONFIG.PCW_USE_S_AXI_GP0 {1} \
+    CONFIG.PCW_USE_FABRIC_INTERRUPT {1} \
+    CONFIG.PCW_IRQ_F2P_INTR {1}\
+    ] [get_bd_cells ${ps_name}]
+
+# Create accelerator
+create_bd_cell -type ip -vlnv xilinx.com:hls:myproject_axi:1.0 ${acc_name}
+
+# Wiring
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config " \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master /${ps_name}/M_AXI_GP0 \
+    Slave /${acc_name}/s_axi_CTRL_BUS \
+    intc_ip {New AXI Interconnect} \
+    master_apm {0}" [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config " \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master /${acc_name}/m_axi_IN_BUS \
+    Slave /${ps_name}/S_AXI_GP0 \
+    intc_ip {Auto} \
+    master_apm {0}" [get_bd_intf_pins ${ps_name}/S_AXI_GP0]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config " \
+    Clk_master /${ps_name}/FCLK_CLK0 (100 MHz) \
+    Clk_slave /${ps_name}/FCLK_CLK0 (100 MHz) \
+    Clk_xbar /${ps_name}/FCLK_CLK0 (100 MHz) \
+    Master /${acc_name}/m_axi_OUT_BUS \
+    Slave /${ps_name}/S_AXI_GP0 \
+    intc_ip {/axi_smc} \
+    master_apm {0}" [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS]
+
+# Wiring interrupt signal
+connect_bd_net [get_bd_pins ${acc_name}/interrupt] [get_bd_pins ${ps_name}/IRQ_F2P]
+
+# Top level wrapper
+make_wrapper -files [get_files ./${project_name}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top
+add_files -norecurse ./${project_name}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v
+
+# Memory mapping
+delete_bd_objs [get_bd_addr_segs ${project_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_GP0_QSPI_LINEAR]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_GP0_IOP]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_GP0_M_AXI_GP0]
+delete_bd_objs [get_bd_addr_segs ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_GP0_QSPI_LINEAR]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_GP0_IOP]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_GP0_M_AXI_GP0]
+
+# Run synthesis and implementation
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+# Reporting
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
+
+# Export HDF file for SDK flow
+file mkdir ./hdf
+file copy -force ${project_name}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf
diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py
index f979b60321..15b50fd2dd 100644
--- a/hls4ml/writer/vivado_accelerator_writer.py
+++ b/hls4ml/writer/vivado_accelerator_writer.py
@@ -82,11 +82,18 @@ def write_axi_wrapper(self, model):

         io_type = model.config.get_config_value("IOType")

+        model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram']
+
         for line in f.readlines():
             if 'void myproject(' in line:
                 newline = 'void {}_axi(\n'.format(model.config.get_project_name())
             elif '//hls-fpga-machine-learning insert include' in line:
                 newline = '#include "{}_axi.h"\n'.format(model.config.get_project_name())
+                for b in model_brams:
+                    newline += '#include "weights/{}.h"\n'.format(b.name)
+                newline += '\n'
+                if model_brams:
+                    newline += '#include "nnet_utils/nnet_helpers.h"\n'
             elif '//hls-fpga-machine-learning insert local vars' in line:
                 newline = ''
                 if self.vivado_accelerator_config.get_interface() == 'axi_stream':
@@ -102,8 +109,8 @@ def write_axi_wrapper(self, model):
                     newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'\
                         .format(model.get_output_variables()[0].pragma[1])
             elif '//hls-fpga-machine-learning insert call' in line:
-                newline = indent + '{}(in_local, out_local);\n'.format(
-                    model.config.get_project_name())
+                brams_str = (''.join([', ' + b.name for b in model_brams])) if len(model_brams) > 0 else ""
+                newline = indent + '{}(in_local, out_local{});\n'.format(model.config.get_project_name(), brams_str)
             elif '//hls-fpga-machine-learning insert interface' in line:
                 if self.vivado_accelerator_config.get_interface() == 'axi_lite':
                     newline = ''
@@ -124,6 +131,23 @@ def write_axi_wrapper(self, model):
                     newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n'
                     if model.config.get_config_value("IOType") == 'io_stream':
                         newline += indent + '#pragma HLS DATAFLOW\n'
+            elif '//hls-fpga-machine-learning insert load weights' in line:
+                newline = ''
+                if model_brams:
+                    newline += '#ifndef __SYNTHESIS__\n'
+                    newline += indent + 'static bool loaded_weights = false;\n'
+                    newline += indent + 'if (!loaded_weights) {\n'
+                    newline += indent + '    loaded_weights = true;\n'
+                    for layer in model.get_layers():
+                        for w in layer.get_weights():
+                            if w.weight_class == 'CompressedWeightVariable':
+                                newline += indent + '    nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format(w.type.name, w.nonzeros, w.name, w.name)
+                            elif w.weight_class == 'ExponentWeightVariable':
+                                newline += indent + '    nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format(w.type.name, w.data_length, w.name, w.name)
+                            else:
+                                newline += indent + '    nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format(w.type.name, w.data_length, w.name, w.name)
+                    newline += indent + '}\n'
+                    newline += '#endif\n'
             elif '//hls-fpga-machine-learning insert enqueue' in line:
                 io_type = model.config.get_config_value("IOType")
                 if io_type == 'io_parallel':
@@ -139,10 +163,12 @@ def write_axi_wrapper(self, model):
                     newline += indent + '}\n'
                 elif io_type == 'io_stream':
                     newline = ''
+                    newline += 'LOAD_INPUT_OUTER_LOOP:\n'
                     newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n'
                     # newline += indent + indent + '#pragma HLS PIPELINE\n'
                     newline += indent + indent + '{input_t} ctype;\n'
                     newline += indent + indent + '#pragma HLS DATA_PACK variable=ctype\n'
+                    newline += 'LOAD_INPUT_INNER_LOOP:\n'
                     newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n'
                     # newline += indent + indent + indent + '#pragma HLS UNROLL\n'
                     if self.vivado_accelerator_config.get_interface() == 'axi_stream':
@@ -169,9 +195,11 @@ def write_axi_wrapper(self, model):
                     newline += indent + '}\n'
                 elif io_type == 'io_stream':
                     newline = ''
+                    newline += 'STORE_OUTPUT_OUTER_LOOP:\n'
                     newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n'
                     # newline += indent + indent + '#pragma HLS PIPELINE\n'
                     newline += indent + indent + '{result_t} ctype = out_local.read();\n'
+                    newline += 'STORE_OUTPUT_INNER_LOOP:\n'
                     newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n'
                     # newline += indent + indent + indent + '#pragma HLS UNROLL\n'
                     if self.vivado_accelerator_config.get_interface() == 'axi_stream':
@@ -188,6 +216,35 @@ def write_axi_wrapper(self, model):
         f.close()
         fout.close()

+    def modify_project_cpp(self, model):
+        '''
+        Strip the HLS interface pragmas and weight-loading calls from the generated project .cpp, since these are now handled by the AXI wrapper
+        '''
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        oldfile = '{}/firmware/{}.cpp'.format(model.config.get_output_dir(), model.config.get_project_name())
+        newfile = '{}/build_prj_axi.tcl'.format(model.config.get_output_dir())
+        f = open(oldfile, 'r')
+        fout = open(newfile, 'w')
+
+        for line in f.readlines():
+            if '#pragma HLS INTERFACE axis port=' in line:
+                newline = ''
+            elif '#pragma HLS INTERFACE bram port=' in line:
+                newline = ''
+            elif 'nnet::load_weights_from_txt' in line:
+                newline = ''
+            elif 'nnet::load_exponent_weights_from_txt' in line:
+                newline = ''
+            elif 'nnet::load_compressed_weights_from_txt' in line:
+                newline = ''
+            else:
+                newline = line
+            fout.write(newline)
+
+        f.close()
+        fout.close()
+        os.rename(newfile, oldfile)
+
     def modify_build_script(self, model):
         '''
         Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function
@@ -369,6 +426,7 @@ def write_hls(self, model):
         self.write_driver(model)
         self.write_wrapper_test(model)
         self.write_axi_wrapper(model)
+        self.modify_project_cpp(model)
         self.modify_build_script(model)
         self.write_new_tar(model)

diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py
index bcf752b835..7ee61b8030 100644
--- a/hls4ml/writer/vivado_writer.py
+++ b/hls4ml/writer/vivado_writer.py
@@ -40,10 +40,10 @@ def print_array_to_cpp(self, var, odir, write_txt_file=True):

         if write_txt_file:
             h_file.write("#ifndef __SYNTHESIS__\n")
-            h_file.write(var.definition_cpp() + ";\n")
+            h_file.write("static " + var.definition_cpp() + ";\n")
             h_file.write("#else\n")
-            h_file.write(var.definition_cpp() + " = {")
+            h_file.write("static " + var.definition_cpp() + " = {")

         # fill c++ array.
         # not including internal brackets for multidimensional case
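For context, here is a minimal usage sketch (not part of the patch) showing how the new `axi_master` PYNQ driver above is meant to be driven from the PS on a PYNQ-Z2. The bitstream name, the I/O shapes and the `ap_fixed<16,6>` encoding are illustrative assumptions, and the accelerator is assumed to be exposed as `myproject_axi_0`, exactly as in the driver template.

```python
import numpy as np
from axi_master_driver import NeuralNetworkOverlay  # driver file copied to the board

# Hypothetical bitstream name and I/O shapes -- substitute those of your project.
nn = NeuralNetworkOverlay('my_bitfile.bit', x_shape=(16,), y_shape=(5,), dtype=np.int16)

# For an ap_fixed<16,6> interface the PS performs the fixed-point conversion itself,
# using the 2**(16-6) = 2**10 scale factor described in the predict() docstring.
encode_v = np.vectorize(lambda xi: np.int16(round(xi * 2**10)))
decode_v = np.vectorize(lambda yi: yi * 2**-10)

X = np.random.rand(16).astype(np.float32)  # one dummy input vector
y, latency_s, rate = nn.predict(X, profile=True, encode=encode_v, decode=decode_v)
```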