diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json index 1279ec22d0..382ae8b1fd 100644 --- a/hls4ml/backends/vivado_accelerator/supported_boards.json +++ b/hls4ml/backends/vivado_accelerator/supported_boards.json @@ -38,5 +38,11 @@ "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, "c_drivers": {} + }, + "ultra96v2": { + "part": "xczu3eg-sbva484-1-e", + "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"}, + "python_drivers": {"axi_stream": "axi_master_driver.py"}, + "c_drivers": { "axi_master": "axi_master_design.c"} } } diff --git a/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py b/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py index f9c7848ef2..230bfb849e 100644 --- a/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py +++ b/hls4ml/backends/vivado_accelerator/vivado_accelerator_config.py @@ -128,14 +128,22 @@ def get_clock_period(self): def get_driver_path(self): if self.board.startswith('alveo'): return '../templates/vivado_accelerator/' + 'alveo/' + self.driver + '_drivers/' + \ - self.get_driver_file() + self.get_driver_files() else: return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + \ - self.get_driver_file() - - def get_driver_file(self): - driver_ext = '.py' if self.driver == 'python' else '.h' - return self.interface + '_driver' + driver_ext + self.get_driver_files() + + #def get_driver_file(self): + # driver_ext = '.py' if self.driver == 'python' else '.h' + # return self.interface + '_driver' + driver_ext + + def get_driver_files(self): + if self.driver == 'c': + driver_dir = 'sdk' + return driver_dir + elif self.driver == 'python': + driver_ext = '.py' + return self.interface + '_driver' + driver_ext def get_krnl_rtl_src_dir(self): return '../templates/vivado_accelerator/' + 'alveo/' + '/krnl_rtl_src' diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 8bcb832cef..39dd7a3f53 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -604,6 +604,11 @@ def compile(self): self._top_function_lib = ctypes.cdll.LoadLibrary(lib_name) def _get_top_function(self, x): + + io_type = self.config.get_config_value('IOType') + interface = self.config.get_config_value('AcceleratorConfig')['Interface'] if self.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + if self._top_function_lib is None: raise Exception('Model not compiled') if len(self.get_input_variables()) == 1: @@ -611,7 +616,8 @@ def _get_top_function(self, x): else: xlist = x n_outputs = len(self.get_output_variables()) - + n_weights = len(self.get_weight_variables()) + for xi in xlist: if not isinstance(xi, np.ndarray): raise Exception('Expected numpy.ndarray, but got {}'.format(type(x))) @@ -628,9 +634,9 @@ def _get_top_function(self, x): else: raise Exception('Invalid type ({}) of numpy array. 
Supported types are: single, float32, double, float64, float_.'.format(x0.dtype)) - top_function.restype = None - top_function.argtypes = [npc.ndpointer(ctype, flags="C_CONTIGUOUS") for i in range(len(xlist) + n_outputs)] + top_function.argtypes = [npc.ndpointer(ctype, flags="C_CONTIGUOUS") \ + for i in range(len(xlist) + (n_weights if config_weights else 0) + n_outputs)] return top_function, ctype @@ -654,10 +660,16 @@ def _compute_n_samples(self, x): return int(n_sample) def predict(self, x): + + io_type = self.config.get_config_value('IOType') + interface = self.config.get_config_value('AcceleratorConfig')['Interface'] if self.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + top_function, ctype = self._get_top_function(x) n_samples = self._compute_n_samples(x) n_inputs = len(self.get_input_variables()) n_outputs = len(self.get_output_variables()) + n_weights = len(self.get_weight_variables()) curr_dir = os.getcwd() os.chdir(self.config.get_output_dir() + '/firmware') @@ -675,10 +687,16 @@ def predict(self, x): inp = [np.asarray(xj[i]) for xj in x] argtuple = inp argtuple += predictions + if config_weights: + for j in range(n_weights): + weights = [float(w) for w in self.get_weight_variables()[j]] + argtuple += [np.asarray(weights)] argtuple = tuple(argtuple) top_function(*argtuple) - output.append(predictions) - + if config_weights and n_samples == 1 and n_inputs: + output.append([predictions]) + else: + output.append(predictions) # Convert to list of numpy arrays (one for each output) output = [np.asarray([output[i_sample][i_output] for i_sample in range(n_samples)]) for i_output in range(n_outputs)] diff --git a/hls4ml/model/optimizer/passes/nop.py b/hls4ml/model/optimizer/passes/nop.py index daf3e71fc4..fae9bbcfbc 100644 --- a/hls4ml/model/optimizer/passes/nop.py +++ b/hls4ml/model/optimizer/passes/nop.py @@ -6,7 +6,7 @@ def match(self, node): cast = False if isinstance(node, Activation): cast = node.get_input_variable().type.precision != node.get_output_variable().type.precision - return isinstance(node, Activation) and node.get_attr('activation') == 'linear' and not cast + return isinstance(node, Activation) and node.get_attr('activation') == 'linear' # and not cast def transform(self, model, node): model.remove_node(node) diff --git a/hls4ml/templates/vivado/myproject_test.cpp b/hls4ml/templates/vivado/myproject_test.cpp index 7de8dd4b4a..8a28926cd1 100644 --- a/hls4ml/templates/vivado/myproject_test.cpp +++ b/hls4ml/templates/vivado/myproject_test.cpp @@ -22,8 +22,9 @@ #include #include #include -#include -#include +#include +#include +#include #include "firmware/myproject.h" #include "firmware/nnet_utils/nnet_helpers.h" @@ -56,6 +57,10 @@ int main(int argc, char **argv) std::string pline; int e = 0; + //hls-fpga-machine-learning insert weights + + //hls-fpga-machine-learning insert load weights + if (fin.is_open() && fpr.is_open()) { while ( std::getline(fin,iline) && std::getline (fpr,pline) ) { if (e % CHECKPOINT == 0) std::cout << "Processing input " << e << std::endl; diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp index 7a06633e58..519a36194c 100644 --- a/hls4ml/templates/vivado_accelerator/myproject_axi.cpp +++ b/hls4ml/templates/vivado_accelerator/myproject_axi.cpp @@ -1,14 +1,17 @@ //hls-fpga-machine-learning insert include void myproject( - input_axi_t in[N_IN], - output_axi_t out[N_OUT] - ){ + input_axi_t 
in[N_IN] + , output_axi_t out[N_OUT] + //hls-fpga-machine-learning insert weights + ){ //hls-fpga-machine-learning insert interface //hls-fpga-machine-learning insert local vars + //hls-fpga-machine-learning insert enqueue weights + //hls-fpga-machine-learning insert enqueue //hls-fpga-machine-learning insert call diff --git a/hls4ml/templates/vivado_accelerator/myproject_axi.h b/hls4ml/templates/vivado_accelerator/myproject_axi.h index fe3dbc5cde..3d31fa0a8f 100644 --- a/hls4ml/templates/vivado_accelerator/myproject_axi.h +++ b/hls4ml/templates/vivado_accelerator/myproject_axi.h @@ -7,7 +7,8 @@ //hls-fpga-machine-learning insert definitions void myproject( - input_axi_t in[N_IN], - output_axi_t out[N_OUT] - ); + input_axi_t in[N_IN] + , output_axi_t out[N_OUT] + //hls-fpga-machine-learning insert weights + ); #endif diff --git a/hls4ml/templates/vivado_accelerator/standalone_main.c b/hls4ml/templates/vivado_accelerator/standalone_main.c new file mode 100644 index 0000000000..208985b1bf --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/standalone_main.c @@ -0,0 +1,269 @@ +/** + * + * Set Heap Size in ldscript.ld to 0x1000000 (16MB) + * + */ + +#include "xmyproject_axi.h" /* TODO: design-dependent name */ +#include "stdio.h" /* PRINTF */ +#include "unistd.h" /* sleep */ +#include "stdlib.h" +#include "malloc.h" +#include "assert.h" +#include "xil_io.h" /* peripheral read/write wrappers */ +#include "xtime_l.h" /* to measure performance of the system */ +#include "platform.h" /* platform init/cleanup functions */ +#include "xil_cache.h" /* enable/disable caches etc */ +#include "xil_printf.h" /* UART debug print functions */ +#include "xparameters.h" /* peripherals base addresses */ + +#include "data.h" + +//#define __DEBUG__ + +#define MAX_PRINT_ELEMENTS (16) + +#define PRINTF printf + +const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS; +const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS; + +#if 1 +/* Accelerator verification */ +#define REFERENCE_OUTPUTS data_y_hls_outputs +#else +/* Accelerator validation */ +#define REFERENCE_OUTPUTS data_y_outputs +//#define REFERENCE_OUTPUTS data_y_keras_outputs +#endif + +unsigned get_max(float *data, unsigned n_elements) { + float max_value = 0.0; + unsigned max_index = 0; + for (unsigned i = 0; i < n_elements; i++) + if (data[i] >= max_value) { + max_index = i; + max_value = data[i]; + } + return max_index; +} + +float *inputs_mem = NULL; +float *outputs_mem = NULL; +float *reference_mem = NULL; + +/* Accelerator configuration */ +XMyproject_axi accelerator; /* TODO: design-dependent name */ +XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */ + +/* Accelerator initialization routine */ +void init_accelerators() { + PRINTF("INFO: Initializing accelerator\r\n"); + accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */ + if (accelerator_cfg) { + int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */ + if (status != XST_SUCCESS) { + PRINTF("ERROR: Initializing accelerator\r\n"); + } + } +} + +/* Reference implementation of the accelerator in software */ +int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) { +#ifdef __DEBUG__ + PRINTF("INFO: Reference outputs are pre-compiled. 
It would be nice to run a software model here.\r\n"); +#endif + /* See data.h for inputs and outputs */ + for (unsigned i = 0; i < n_samples * n_y_ouputs; i++) { + sw_outputs_mem[i] = REFERENCE_OUTPUTS[i]; + } + return 0; +} + +/* Profiling function */ +double get_elapsed_time(XTime start, XTime stop) { + return 1.0 * (stop - start) / (COUNTS_PER_SECOND); +} + +/* Dump data to the console */ +void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) { + PRINTF("INFO: %s[%u][%u]:\r\n", label, n_samples, feature_count); + /* Print at most MAX_PRINT_ELEMENTS */ + for (unsigned i = 0; i < n_samples && i < MAX_PRINT_ELEMENTS; i++) { + PRINTF("INFO: [%u] ", i); + for (unsigned j = 0; j < feature_count; j++) { + unsigned index = i * feature_count + j; + PRINTF("%f ", data[index]); + } + PRINTF("\r\n"); + } +} + +/* The top of the hill :-) */ +int main(int argc, char** argv) { + + XTime start, stop; + double calibration_time; + double sw_elapsed = 0; + double hw_elapsed = 0; + double cache_elapsed = 0; + unsigned hw_errors; + + char __attribute__ ((unused)) dummy; /* dummy input */ + + /* Initialize platform (uart and caches) */ + init_platform(); + + PRINTF("\r\n"); + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */ + PRINTF("INFO: ==================================================\r\n"); + + init_accelerators(); + + inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float)); + outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float)); + reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float)); + + /* Calibration */ + XTime_GetTime(&start); + sleep(1); + XTime_GetTime(&stop); + calibration_time = get_elapsed_time(start, stop); + PRINTF("INFO: Time calibration for one second (%lf sec)\r\n", calibration_time); + + /* Initialize memory */ + PRINTF("INFO: Initialize memory\r\n"); + PRINTF("INFO: - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */ + PRINTF("INFO: - Inputs count: %u\r\n", N_X_INPUTS); + PRINTF("INFO: - Outputs count: %u\r\n", N_Y_OUTPUTS); + PRINTF("INFO: - Data size: %u B\r\n", sizeof(float)); + PRINTF("INFO: - Total input size: %u B, %.2f KB, %.2f MB\r\n", N_X_INPUTS * N_SAMPLES * sizeof(float), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024)); + PRINTF("INFO: - Total output size: %u B, %.2f KB, %.2f MB\r\n", N_Y_OUTPUTS * N_SAMPLES * sizeof(float), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024)); + + // Set Heap Size in ldscript.ld to 0x1000000 (16MB) + //malloc_stats(); + + for (int i = 0; i < INPUT_N_ELEMENTS; i++) { + inputs_mem[i] = data_X_inputs[i]; + } + for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) { + outputs_mem[i] = 0x0; + } + + /* ****** SW REFERENCE ****** */ + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: Start SW reference implementation\r\n"); + XTime_GetTime(&start); + sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS); + XTime_GetTime(&stop); + sw_elapsed = get_elapsed_time(start, stop); + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: Press any key to start:\r\n"); + dummy = inbyte(); + //PRINTF("INFO:"); + + /* ****** HW ACCELERATOR ****** */ + PRINTF("INFO: Start HW accelerator\r\n"); + + XTime_GetTime(&start); + Xil_DCacheFlushRange((UINTPTR)inputs_mem, 
INPUT_N_ELEMENTS * sizeof(float)); + Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + XTime_GetTime(&stop); + cache_elapsed = get_elapsed_time(start, stop); + + /*hls-fpga-machine-learning insert configure weights*/ + + /*hls-fpga-machine-learning insert load weights on*/ + + /*hls-fpga-machine-learning insert start and wait*/ + + + for (unsigned j = 0; j < N_SAMPLES; j++) { + float *inputs_mem_i = inputs_mem + j * N_X_INPUTS; + float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS; + + /* Configure the accelerator */ + XTime_GetTime(&start); + XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */ + XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */ + + /*hls-fpga-machine-learning insert load weights off*/ + + XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */ + while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */ + + /* Get error status */ + //hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */ + XTime_GetTime(&stop); + hw_elapsed += get_elapsed_time(start, stop); + } + + XTime_GetTime(&start); + Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + XTime_GetTime(&stop); + cache_elapsed += get_elapsed_time(start, stop); + + PRINTF("INFO: HW accelerator done!\r\n"); + + /* ****** VALIDATION ****** */ + PRINTF("INFO: ================== Verification ==================\r\n"); +#ifdef __DEBUG__ + PRINTF("INFO: Dump data\r\n"); + dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS); + dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS); + dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS); +#endif + +#ifdef __DEBUG__ + PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed); +#endif + PRINTF("INFO: HW-acceleration exec. 
time (%d inferences):\r\n", N_SAMPLES); + PRINTF("INFO: - total %f sec\r\n", hw_elapsed); + PRINTF("INFO: - per-inference %.12f sec (%f ns)\r\n", hw_elapsed / (N_SAMPLES), (hw_elapsed*1000.0) / (N_SAMPLES)); + PRINTF("INFO: Cache flush time: %f sec\r\n", cache_elapsed); +#ifdef __DEBUG__ + PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed)); +#endif + + hw_errors = 0; +#if 1 + /* Accelerator verification */ + for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) { + if (outputs_mem[i] != reference_mem[i]) { + PRINTF("ERROR: [%d]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]); + hw_errors++; + } + } + PRINTF("INFO: Total errors = %d (out of %d elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS); + if (hw_errors > 0) + PRINTF("INFO: Verification: FAIL\r\n"); + else + PRINTF("INFO: Verification: PASS!\r\n"); +#else + /* Accelerator validation */ + for (unsigned s = 0; s < N_SAMPLES; s++) { + unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS); + unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS); + if (hw_digit != ref_digit) { +#ifdef __DEBUG__ + PRINTF("ERROR: [%d]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit); +#endif + hw_errors++; + } + } + float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0; + float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0); + PRINTF("INFO: Total errors = %d (out of %d digits)\r\n", hw_errors, N_SAMPLES); + PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate); + PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy); +#endif + PRINTF("INFO: ==================================================\r\n"); + + cleanup_platform(); + + return 0; +} + + diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c new file mode 100644 index 0000000000..8a46df8bde --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c @@ -0,0 +1,6 @@ +#include "xil_printf.h" + +int main(void) { + xil_printf("Hello world!\r\n"); + return 0; +} diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h new file mode 100644 index 0000000000..8a46df8bde --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h @@ -0,0 +1,6 @@ +#include "xil_printf.h" + +int main(void) { + xil_printf("Hello world!\r\n"); + return 0; +} diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile new file mode 100644 index 0000000000..03ab9b8de7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile @@ -0,0 +1,33 @@ +DESIGN := design_1 + +help: + @echo "INFO: make to show targets" +.PHONY: help + +--setup: + xsct ./setup.tcl $(DESIGN) +.PHONY: --setup + +sdk: --setup + rm -f $(DESIGN)_standalone/src/helloworld.c + cd $(DESIGN)_standalone/src && ln -s ../../common/main.c main.c + cd $(DESIGN)_standalone/src && ln -s ../../common/data.h data.h +.PHONY: sdk + +gui: + xsdk --workspace . 
& +.PHONY: gui + +clean: + rm -rf $(DESIGN)_platform + rm -rf $(DESIGN)_standalone + rm -rf $(DESIGN)_standalone_bsp + rm -rf RemoteSystemsTempFiles + rm -rf .Xil + rm -rf .metadata + rm -f *.log +.PHONY: clean + +ultraclean: clean + rm -rf hdf/*.hdf +.PHONY: ultraclean diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl new file mode 100644 index 0000000000..ea386d4281 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl @@ -0,0 +1,18 @@ +# See +# https://www.xilinx.com/html_docs/xilinx2019_1/SDK_Doc/xsct/intro/xsct_introduction.html + +setws . +if { $::argc == 1 } { + set myproject [lindex $::argv 0] + createhw -name ${myproject}\_platform -hwspec ../hdf/${myproject}\_wrapper.hdf + createapp -name ${myproject}\_standalone -app {Hello World} -proc psu_cortexa53_0 -hwproject ${myproject}\_platform -os standalone -arch 64 + configbsp -bsp ${myproject}\_standalone_bsp stdin psu_uart_1 + configbsp -bsp ${myproject}\_standalone_bsp stdout psu_uart_1 + updatemss -mss ${myproject}\_standalone_bsp/system.mss + regenbsp -bsp ${myproject}\_standalone_bsp + configapp -app ${myproject}\_standalone build-config release + configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_HEAP_SIZE=0x1000000} + configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_STACK_SIZE=0x40000} + projects -build + #configapp -app ${myproject}\_standalone -add define-compiler-symbols {FLAG=VALUE} +} diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_master_driver.py b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_master_driver.py new file mode 100644 index 0000000000..7e2419b8f5 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_master_driver.py @@ -0,0 +1,136 @@ +from pynq import DefaultHierarchy, DefaultIP, allocate +from pynq import Overlay +from datetime import datetime +import pynq.lib.dma +import numpy as np + + +class NeuralNetworkOverlay(Overlay): + #def __init__(self, bitfile_name, x_shape, y_shape, w2_shape, b2_shape, w5_shape, b5_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None): + #hls-fpga-machine-learning insert init + super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) + self.regin = self.myproject_axi_0.register_map.in_r.address + self.regout = self.myproject_axi_0.register_map.out_r.address + # + #self.regw2 = self.myproject_axi_0.register_map.w2.address + #self.regb2 = self.myproject_axi_0.register_map.b2.address + #self.regw5 = self.myproject_axi_0.register_map.w5.address + #self.regb5 = self.myproject_axi_0.register_map.b5.address + #hls-fpga-machine-learning insert registers + self.reglw = self.myproject_axi_0.register_map.load_weights.address + # + self.ctrl = self.myproject_axi_0.register_map.CTRL + self.input_buffer = allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + #self.w2_buffer = allocate(shape=w2_shape, dtype=dtype) + #self.b2_buffer = allocate(shape=b2_shape, dtype=dtype) + #self.w5_buffer = allocate(shape=w5_shape, dtype=dtype) + #self.b5_buffer = allocate(shape=b5_shape, dtype=dtype) + #hls-fpga-machine-learning insert buffers + def _print_dt(self, timea, timeb, N): + dt = (timeb - timea) + dts = dt.seconds + dt.microseconds * 10 ** -6 + rate = N / dts + print("Classified {} samples in {} seconds ({} inferences / 
s)".format(N, dts, rate)) + return dts, rate + +# def load_weights(self, w2, b2, w5, b5, debug=False, profile=False, encode=None): + #hls-fpga-machine-learning insert load weights + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - w*, b* : the weight and bias vectors. Should be numpy ndarray. + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode: function pointers. See `dtype` section for more information. + """ + if profile: + timea = datetime.now() + if encode is not None: + #w2 = encode(w2) + #b2 = encode(b2) + #w5 = encode(w5) + #b5 = encode(b5) + #hls-fpga-machine-learning insert encode + # + #self.w2_buffer[:] = w2 + #self.b2_buffer[:] = b2 + #self.w5_buffer[:] = w5 + #self.b5_buffer[:] = b5 + #hls-fpga-machine-learning insert set buffers + # + #self.myproject_axi_0.write(self.regw2, self.w2_buffer.physical_address) + #self.myproject_axi_0.write(self.regb2, self.b2_buffer.physical_address) + #self.myproject_axi_0.write(self.regw5, self.w5_buffer.physical_address) + #self.myproject_axi_0.write(self.regb5, self.b5_buffer.physical_address) + #hls-fpga-machine-learning insert set registers + # + self.myproject_axi_0.write(self.reglw, 0x1) + # + self.myproject_axi_0.write(self.ctrl.AP_START, 0x1) + if debug: + print("Config OK") + while not self.ctrl.AP_DONE: + if debug: + print("Polling...") + if debug: + print("Done OK") + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter. 
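            Example (sketch; the exact `__init__`/`load_weights` signatures are generated per model by the
            hls4ml writer, and the `w2/b2/w5/b5` names below follow the commented template above -- they are
            placeholders, as is the bitstream name):
            ```
            nn = NeuralNetworkOverlay('design_1.bit', x_shape, y_shape, w2_shape, b2_shape, w5_shape, b5_shape)
            nn.load_weights(w2, b2, w5, b5)                # fills the DDR buffers and runs once with load_weights = 1
            y_hw, dts, rate = nn.predict(X, profile=True)  # inference pass, runs with load_weights = 0
            ```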
+ """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.myproject_axi_0.write(self.regin, self.input_buffer.physical_address) + self.myproject_axi_0.write(self.regout, self.output_buffer.physical_address) + # + self.myproject_axi_0.write(self.reglw, 0x0) + # + self.myproject_axi_0.write(self.ctrl.AP_START, 0x1) + if debug: + print("Config OK") + while not self.ctrl.AP_DONE: + if debug: + print("Polling...") + if debug: + print("Done OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py new file mode 100644 index 0000000000..4adb187ab4 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py @@ -0,0 +1,75 @@ +from pynq import DefaultHierarchy, DefaultIP, allocate +from pynq import Overlay +from datetime import datetime +import pynq.lib.dma +import numpy as np + + +class NeuralNetworkOverlay(Overlay): + def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, + device=None): + super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) + self.sendchannel = self.hier_0.axi_dma_0.sendchannel + self.recvchannel = self.hier_0.axi_dma_0.recvchannel + self.input_buffer = allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + + def _print_dt(self, timea, timeb, N): + dt = (timeb - timea) + dts = dt.seconds + dt.microseconds * 10 ** -6 + rate = N / dts + print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate)) + return dts, rate + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter. 
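            Example (sketch; bitstream name and shapes are placeholders):
            ```
            nn = NeuralNetworkOverlay('design_1.bit', x_shape=(1024, 16), y_shape=(1024, 10))
            y_hw, dts, rate = nn.predict(X, profile=True)
            ```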
+ """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.sendchannel.transfer(self.input_buffer) + self.recvchannel.transfer(self.output_buffer) + if debug: + print("Transfer OK") + self.sendchannel.wait() + if debug: + print("Send OK") + self.recvchannel.wait() + if debug: + print("Receive OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer \ No newline at end of file diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl new file mode 100644 index 0000000000..2df93afca5 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl @@ -0,0 +1,26 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${myproject}_vivado_accelerator -part xczu3eg-sbva484-1-e -force + +set_property board_part em.avnet.com:ultra96:part0:1.2 [current_project] +set_property ip_repo_paths ${myproject}_prj [current_project] +update_ip_catalog + +# Create Block Designer design +create_bd_design "design_1" +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ultra_ps_e] +create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0 +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${myproject}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${myproject}_axi_0/s_axi_AXILiteS] + +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl new file mode 100644 index 0000000000..0d5eb1a89e --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl @@ -0,0 +1,100 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +# Project names +set design_name "design_1" +set hls_solution_name "solution1" +set ps_name "zynq_ultra_ps_e_0" +set acc_name "${project_name}_axi_0" + +# Board and chip part names +create_project ${project_name} ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force +set_property board_part avnet.com:ultra96v2:part0:1.2 [current_project] + +# Create block design +create_bd_design ${design_name} + +# Setup IP repo +#set_property ip_repo_paths ${project_name}_prj [current_project] +set_property ip_repo_paths ${project_name}_prj/${hls_solution_name}/impl/ip [current_project] 
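# NOTE (assumption): project.tcl, sourced at the top of this script, is generated
# alongside these board scripts by the hls4ml writer and is expected to define at
# least the variables they reference, e.g. (placeholder values):
#   set project_name "myproject"
#   set myproject    "myproject"
#   set bit_width_hls_input  16   ;# consumed by axi_stream_design.tcl below
#   set bit_width_hls_output 16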
+update_ip_catalog + +# Create and setup PS +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 ${ps_name} +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells ${ps_name}] +set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells ${ps_name}] +set_property -dict [list CONFIG.PSU__MAXIGP0__DATA_WIDTH {32} CONFIG.PSU__MAXIGP1__DATA_WIDTH {32}] [get_bd_cells ${ps_name}] + +# Create accelerator +create_bd_cell -type ip -vlnv xilinx.com:hls:myproject_axi:1.0 ${acc_name} + +# Wiring +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {Auto} \ + Clk_slave {Auto} \ + Clk_xbar {Auto} \ + Master "/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD" \ + Slave "/myproject_axi_0/s_axi_CTRL_BUS" \ + intc_ip {New AXI Interconnect} \ + master_apm {0}} [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {Auto} \ + Clk_slave "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_xbar "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Master "/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD" \ + Slave "/myproject_axi_0/s_axi_CTRL_BUS" \ + intc_ip {/ps8_0_axi_periph} \ + master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_slave {Auto} \ + Clk_xbar {Auto} \ + Master "/myproject_axi_0/m_axi_IN_BUS" \ + Slave "/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD" \ + intc_ip {Auto} \ + master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_slave "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_xbar "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Master "/myproject_axi_0/m_axi_OUT_BUS" \ + Slave "/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD" \ + intc_ip {/axi_smc} \ + master_apm {0}} [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_slave "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Clk_xbar "/zynq_ultra_ps_e_0/pl_clk0 (100 MHz)" \ + Master "/myproject_axi_0/m_axi_MODEL_BUS" \ + Slave "/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD" \ + intc_ip {/axi_smc} \ + master_apm {0}} [get_bd_intf_pins myproject_axi_0/m_axi_MODEL_BUS] + +# Wiring interrupt signal +connect_bd_net [get_bd_pins ${acc_name}/interrupt] [get_bd_pins ${ps_name}/pl_ps_irq0] + +# Top level wrapper +make_wrapper -files [get_files ./${project_name}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top +add_files -norecurse ./${project_name}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v + +# Memory mapping +delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_HPC0_LPS_OCM] +delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_HPC0_LPS_OCM] + +# Run synthesis and implementation +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +# Reporting +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages + +# Export HDF file for SDK flow +file mkdir ./hdf +file copy -force ${project_name}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef 
./hdf/${design_name}_wrapper.hdf diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 0000000000..4721b59941 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,58 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${myproject}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force + +set_property board_part em.avnet.com:ultra96:part0:1.2 [current_project] +set_property ip_repo_paths ${myproject}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" +set_property ip_repo_paths ${myproject}_prj/solution1/impl/ip [current_project] +update_ip_catalog + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] + +set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] +endgroup + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0 +endgroup +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${myproject}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] 
[get_bd_intf_pins ${myproject}_axi_0/out_r] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${myproject}_axi_0/ap_clk] +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${myproject}_axi_0] + +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index f979b60321..cbcd1d3000 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -17,6 +17,10 @@ def write_axi_wrapper(self, model): inp_axi_t, out_axi_t, inp, out = self.vivado_accelerator_config.get_corrected_types() indent = ' ' + io_type = model.config.get_config_value('IOType') + interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + ####################### ## myproject_axi.h ####################### @@ -32,6 +36,11 @@ def write_axi_wrapper(self, model): newline = '#include "{}.h"\n'.format(model.config.get_project_name()) elif 'void myproject(' in line: newline = 'void {}_axi(\n'.format(model.config.get_project_name()) + elif config_weights and '//hls-fpga-machine-learning insert weights' in line: + newline = '' + for v in model.get_weight_variables(): + newline += indent + ', model_axi_t {name} [{shape}]\n'.format(name=v.name, shape=v.data_length) + newline += ', char load_weights' elif '//hls-fpga-machine-learning insert definitions' in line: newline = '' newline += 'static const unsigned N_IN = {};\n'.format(inp.size()) @@ -66,6 +75,9 @@ def write_axi_wrapper(self, model): else: newline += 'typedef {} input_axi_t;\n'.format(inp_axi_t) newline += 'typedef {} output_axi_t;\n'.format(out_axi_t) + if config_weights: + newline += 'typedef {} model_axi_t; // FIXME: Arbitrary choice type of the inputs and weights are the same\n'\ + .format(inp_axi_t) else: newline = line fout.write(newline) @@ -87,6 +99,11 @@ def write_axi_wrapper(self, model): newline = 'void {}_axi(\n'.format(model.config.get_project_name()) elif '//hls-fpga-machine-learning insert include' in line: newline = '#include "{}_axi.h"\n'.format(model.config.get_project_name()) + elif config_weights and '//hls-fpga-machine-learning insert weights' in line: + newline = '' + for v in model.get_weight_variables(): + newline += indent + ', model_axi_t {name} [{shape}]\n'.format(name=v.name, shape=v.data_length) + newline += indent + ', char load_weights' elif '//hls-fpga-machine-learning insert local vars' in line: newline = '' if self.vivado_accelerator_config.get_interface() == 'axi_stream': @@ -101,9 +118,22 @@ def write_axi_wrapper(self, model): .format(model.get_input_variables()[0].pragma[1]) newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'\ .format(model.get_output_variables()[0].pragma[1]) + if config_weights: + newline += '\n' + for v in model.get_weight_variables(): + newline += indent + 'static {dtype} {name}_local 
[{shape}];\n'.format(dtype=v.type.name, name=v.name, shape=v.data_length) elif '//hls-fpga-machine-learning insert call' in line: - newline = indent + '{}(in_local, out_local);\n'.format( - model.config.get_project_name()) + if config_weights: + newline = '' + weight_string='' + for v in model.get_weight_variables(): + weight_string+=','+v.name+'_local' + newline = indent + indent + '{}(in_local, out_local'.format(model.config.get_project_name()) + newline += weight_string + newline += ');\n' + else: + newline = indent + '{}(in_local, out_local);\n'.format( + model.config.get_project_name()) elif '//hls-fpga-machine-learning insert interface' in line: if self.vivado_accelerator_config.get_interface() == 'axi_lite': newline = '' @@ -117,6 +147,12 @@ def write_axi_wrapper(self, model): .format(model.get_input_variables()[0].pragma[1]) newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'\ .format(model.get_output_variables()[0].pragma[1]) + if config_weights: + newline += '\n' + for v in model.get_weight_variables(): + newline += indent + '#pragma HLS INTERFACE m_axi depth=1 port={} offset=slave bundle=MODEL_BUS\n'\ + .format(v.name) + newline += indent + '#pragma HLS INTERFACE s_axilite port=load_weights bundle=CTRL_BUS\n' elif self.vivado_accelerator_config.get_interface() == 'axi_stream': newline = '' newline += indent + '#pragma HLS INTERFACE axis port=in\n' @@ -124,6 +160,15 @@ def write_axi_wrapper(self, model): newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n' if model.config.get_config_value("IOType") == 'io_stream': newline += indent + '#pragma HLS DATAFLOW\n' + elif config_weights and '//hls-fpga-machine-learning insert enqueue weights' in line: + newline = '' + newline += indent + 'if (load_weights) {' + for v in model.get_weight_variables(): + newline += indent + indent + 'for (unsigned i = 0; i < {shape}; i++)\n'\ + .format(shape=v.data_length) + newline += indent + indent + indent + '{name}_local[i] = {name}[i];\n'\ + .format(dtype=v.type.name, name=v.name, shape=v.data_length) + newline += indent + '} else {' elif '//hls-fpga-machine-learning insert enqueue' in line: io_type = model.config.get_config_value("IOType") if io_type == 'io_parallel': @@ -138,6 +183,8 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'in_local[i] = in[i]; // Read input with cast\n' newline += indent + '}\n' elif io_type == 'io_stream': + if config_weights: + indent = indent + indent newline = '' newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' @@ -154,6 +201,8 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'in_local.write(ctype);\n' newline += indent + '}}\n' newline = newline.format(input_t=inp.type.name) + if config_weights: + indent = ' ' elif '//hls-fpga-machine-learning insert dequeue' in line: io_type = model.config.get_config_value("IOType") if io_type == 'io_parallel': @@ -168,6 +217,8 @@ def write_axi_wrapper(self, model): newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n' newline += indent + '}\n' elif io_type == 'io_stream': + if config_weights: + indent = indent + indent newline = '' newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n' # newline += indent + indent + '#pragma HLS PIPELINE\n' @@ -182,6 +233,9 @@ def write_axi_wrapper(self, model): newline += indent + indent + '}}\n' newline += indent + '}}\n' newline = 
newline.format(result_t=out.type.name) + if config_weights: + indent = ' ' + newline += indent + '}' else: newline = line fout.write(newline) @@ -232,6 +286,10 @@ def modify_build_script(self, model): def write_wrapper_test(self, model): + io_type = model.config.get_config_value('IOType') + interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + ################### # write myproject_test_wrapper.cpp ################### @@ -256,7 +314,11 @@ def write_wrapper_test(self, model): newline = '' elif '{}('.format(model.config.get_project_name()) in line: indent_amount = line.split(model.config.get_project_name())[0] - newline = indent_amount + '{}_axi(inputs,outputs);\n'.format(model.config.get_project_name()) + if config_weights: + newline = line.replace(model.config.get_project_name(),model.config.get_project_name()+'_axi').\ + replace(inp.name,'inputs').replace(out.name,'outputs') + else: + newline = indent_amount + '{}_axi(inputs,outputs);\n'.format(model.config.get_project_name()) elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'input_axi_t') @@ -288,6 +350,7 @@ def write_wrapper_test(self, model): inp = model.get_input_variables()[0] out = model.get_output_variables()[0] + brams = model.get_weight_variables() for line in f.readlines(): if '{}.h'.format(model.config.get_project_name()) in line: @@ -301,8 +364,13 @@ def write_wrapper_test(self, model): 'output_axi_t {}_ap[N_OUT]'.format(out.name)) elif '{}('.format(model.config.get_project_name()) in line: indent_amount = line.split(model.config.get_project_name())[0] - newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format(model.config.get_project_name(), inp.name, - out.name) + if config_weights: + newline = line.replace(model.config.get_project_name(),model.config.get_project_name()+'_axi') + for b in brams: + newline = newline.replace(b.name, b.name+'_ap') + else: + newline = indent_amount + '{}_axi({}_ap,{}_ap);\n'.format(model.config.get_project_name(), inp.name, + out.name) elif inp.size_cpp() in line or inp.name in line or inp.type.name in line: newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, 'input_axi_t') elif out.size_cpp() in line or out.name in line or out.type.name in line: @@ -315,6 +383,71 @@ def write_wrapper_test(self, model): fout.close() os.rename(newfile, oldfile) + def modify_python_driver(self, model): + + io_type = model.config.get_config_value('IOType') + interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None + driver = model.config.get_config_value('AcceleratorConfig')['Driver'] if model.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + + if driver == 'c' or io_type != 'io_stream' or interface != 'axi_master': + return; + + ################### + # write axi_master_driver.py + ################### + oldfile = '{}/axi_master_driver.py'.format(model.config.get_output_dir(), model.config.get_project_name()) + newfile = '{}/axi_master_driveri.NEW.py'.format(model.config.get_output_dir(), model.config.get_project_name()) + + f = open(oldfile, 'r') + fout = open(newfile, 'w') + + indent = ' ' + brams = model.get_weight_variables() + + for line in 
f.readlines(): + if '#hls-fpga-machine-learning insert init' in line: + newline = line + w_shapes = '' + for b in brams: + w_shapes += b.name + '_shape, ' + newline += indent + 'def __init__(self, bitfile_name, x_shape, y_shape, {}dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None):\n'\ + .format(w_shapes) + elif '#hls-fpga-machine-learning insert registers' in line: + newline = line + for b in brams: + newline += indent + indent + 'self.reg{} = self.myproject_axi_0.register_map.{}.address\n'.format(b.name, b.name) + elif '#hls-fpga-machine-learning insert buffers' in line: + newline = line + for b in brams: + newline += indent + indent + 'self.{}_buffer = allocate(shape={}_shape, dtype=dtype)\n'.format(b.name, b.name) + elif '#hls-fpga-machine-learning insert load weights' in line: + newline = line + weights = '' + for b in brams: + weights += b.name + ', ' + newline = 'def load_weights(self, {}debug=False, profile=False, encode=None):\n'.format(weights) + elif '#hls-fpga-machine-learning insert encode' in line: + newline = line + for b in brams: + newline += indent + indent + '{} = encode({})\n'.format(b.name, b.name) + elif '#hls-fpga-machine-learning insert set buffers' in line: + newline = line + for b in brams: + newline += indent + indent + 'self.{}_buffer[:] = {}\n'.format(b.name, b.name) + elif '#hls-fpga-machine-learning insert set registers' in line: + newline = line + for b in brams: + newline += indent + indent + 'self.myproject_axi_0.write(self.reg{}, self.{}_buffer.physical_address)\n'.format(b.name, b.name) + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + def write_board_script(self, model): ''' Write the tcl scripts and kernel sources to create a Vivado IPI project for the VivadoAccelerator @@ -348,14 +481,133 @@ def write_board_script(self, model): def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()), - ('{}/' + self.vivado_accelerator_config.get_driver_file()).format(model.config.get_output_dir())) - + srcfiles = os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()) + dstfiles = ('{}/' + self.vivado_accelerator_config.get_driver_files()).format(model.config.get_output_dir()) + if os.path.isdir(srcfiles): + copytree(srcfiles, dstfiles, dirs_exist_ok=True) + else: + copyfile(srcfiles, dstfiles) + def write_new_tar(self, model): os.remove(model.config.get_output_dir() + '.tar.gz') super(VivadoAcceleratorWriter, self).write_tar(model) - + def write_standalone_app(self, model): + + indent = ' ' + + weights = model.get_weight_variables() + + io_type = model.config.get_config_value('IOType') + interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None + driver = model.config.get_config_value('AcceleratorConfig')['Driver'] if model.config.get_config_value('AcceleratorConfig') else None + config_weights = (io_type == 'io_stream') and (interface == 'axi_master') + + if driver == 'python': + return; + + ####################### + ## main.c + ####################### + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/vivado_accelerator/standalone_main.c'), 'r') + fout = open('{}/sdk/common/main.c'.format(model.config.get_output_dir()), 'w') + + for line in f.readlines(): + + if config_weights and '/*hls-fpga-machine-learning insert 
configure weights*/' in line: + newline = line + for w in weights: + newline += indent + 'XMyproject_axi_Set_{name}(&accelerator, {name}); /* TODO: design-dependent name */\n'.format(name=w.name) + elif config_weights and '/*hls-fpga-machine-learning insert load weights on*/' in line: + newline = line + newline += indent + 'XMyproject_axi_Set_load_weights(&accelerator, 1); /* TODO: design-dependent name */\n' + elif config_weights and '/*hls-fpga-machine-learning insert load weights off*/' in line: + newline = line + newline += indent + indent + 'XMyproject_axi_Set_load_weights(&accelerator, 0); /* TODO: design-dependent name */\n' + elif config_weights and '/*hls-fpga-machine-learning insert start and wait*/' in line: + newline = line + newline += indent + 'XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */\n' + newline += indent + 'while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */\n' + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + + def write_header_file(model, X, y, y_keras, y_hls, n_samples, filename='data.h'): + #TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package + from hls4ml.backends import VivadoAcceleratorConfig + vivado_accelerator_config = VivadoAcceleratorConfig(model.config, model.get_input_variables(), + model.get_output_variables()) + inp_axi_t, out_axi_t, inp, out = vivado_accelerator_config.get_corrected_types() + header_file = open(filename, 'w') + (n_X_samples, n_X_inputs) = X.shape + (n_y_samples, n_y_outputs) = y.shape + (n_y_keras_samples, n_y_keras_outputs) = y_keras.shape + (n_y_hls_samples, n_y_hls_outputs) = y_hls.shape + + header_file.write('#ifndef __DATA_H__\n') + header_file.write('#define __DATA_H__\n') + header_file.write('/* out of {} */\n'.format(n_X_samples)) + header_file.write('#define N_SAMPLES {}\n'.format(n_samples)) + header_file.write('\n') + + import numpy as np + for layer in model.get_layers(): + for weights in layer.get_weights(): + header_file.write('#define N_{name} {size}\n'.format(name=weights.name.upper(), size=np.prod(weights.shape))) + header_file.write('const {dtype} {name}[N_{uname}] = {{\n'.format(dtype=inp_axi_t, name=weights.name, uname=weights.name.upper())) + for w in weights: + header_file.write(w + ',') + header_file.write('};\n\n') + + header_file.write('#define N_X_INPUTS {}\n'.format(n_X_inputs)) + header_file.write('const {} data_X_inputs[N_SAMPLES*N_X_INPUTS] = {{\n'.format(inp_axi_t)) + for s in range(n_samples): + header_file.write(' ') + for i in range(n_X_inputs): + header_file.write('{}, '.format(X[s][i])) + header_file.write('\n') + header_file.write('};\n') + header_file.write('\n') + header_file.write('/* Ground truth - for validation */\n') + header_file.write('#define N_Y_OUTPUTS {}\n'.format(n_y_outputs)) + header_file.write('const float data_y_outputs[N_SAMPLES*N_Y_OUTPUTS] = {\n') + for s in range(n_samples): + header_file.write(' ') + for o in range(n_y_outputs): + header_file.write('{}, '.format(y[s][o])) + header_file.write('\n') + header_file.write('};\n') + header_file.write('\n') + header_file.write('/* Keras outputs - for validation */\n') + header_file.write('#define N_Y_KERAS_OUTPUTS {}\n'.format(n_y_keras_outputs)) + header_file.write('') + header_file.write('const float data_y_keras_outputs[N_SAMPLES*N_Y_KERAS_OUTPUTS] = {\n') + for s in range(n_samples): + header_file.write(' ') + for o in range(n_y_keras_outputs): + header_file.write('{}, '.format(y_keras[s][o])) 
     def write_hls(self, model):
         """
         Write the HLS project. Calls the VivadoBackend writer, and extra steps for VivadoAccelerator/AXI interface
@@ -369,6 +621,8 @@ def write_hls(self, model):
         self.write_driver(model)
         self.write_wrapper_test(model)
         self.write_axi_wrapper(model)
+        self.write_standalone_app(model)
+        self.modify_python_driver(model)
         self.modify_build_script(model)
         self.write_new_tar(model)

diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py
index af1f1b61ba..8494b49fb4 100644
--- a/hls4ml/writer/vivado_writer.py
+++ b/hls4ml/writer/vivado_writer.py
@@ -109,6 +109,10 @@ def write_project_cpp(self, model):
         model_outputs = model.get_output_variables()
         model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram']

+        io_type = model.config.get_config_value('IOType')
+        interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None
+        config_weights = (io_type == 'io_stream') and (interface == 'axi_master')
+
         indent = '    '

         for line in f.readlines():
@@ -159,7 +163,11 @@ def write_project_cpp(self, model):
                 if io_type == 'io_stream':
                     newline += indent + '#pragma HLS INTERFACE axis port={},{} \n'.format(','.join(all_inputs), ','.join(all_outputs))
                     if all_brams:
-                        newline += indent + '#pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams))
+                        if config_weights:
+                            newline += indent + '//#pragma HLS INTERFACE bram port={} // Disabled (it fails on FPGA otherwise)\n'\
+                                .format(','.join(all_brams))
+                        else:
+                            newline += indent + '#pragma HLS INTERFACE bram port={} \n'.format(','.join(all_brams))
                     newline += indent + '#pragma HLS DATAFLOW \n'

             elif '//hls-fpga-machine-learning insert layers' in line:
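The three-line IOType/Interface check introduced above in write_project_cpp reappears verbatim in write_test_bench and write_bridge below. A small shared helper could express the intent once; the function name and its placement are assumptions, not something this patch adds.

# Hedged refactoring sketch: one place for the repeated io_stream + axi_master check.
# The name weights_over_axi_master is an assumption, not part of this patch.
def weights_over_axi_master(config):
    """Return True when weights are expected to be streamed in over the AXI-master interface."""
    io_type = config.get_config_value('IOType')
    accel_cfg = config.get_config_value('AcceleratorConfig')
    interface = accel_cfg['Interface'] if accel_cfg else None
    return io_type == 'io_stream' and interface == 'axi_master'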
@@ -337,6 +345,10 @@ def write_test_bench(self, model):
         ## test bench
         ###################

+        io_type = model.config.get_config_value('IOType')
+        interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None
+        config_weights = (io_type == 'io_stream') and (interface == 'axi_master')
+
         filedir = os.path.dirname(os.path.abspath(__file__))

         if not os.path.exists('{}/tb_data/'.format(model.config.get_output_dir())):
@@ -362,7 +374,7 @@ def write_test_bench(self, model):
         model_inputs = model.get_input_variables()
         model_outputs = model.get_output_variables()
-        model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram']
+        model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram' or config_weights]

         for line in f.readlines():
             indent = ' ' * (len(line) - len(line.lstrip(' ')))

             #Insert numbers
             if 'myproject' in line:
                 newline = line.replace('myproject', model.config.get_project_name())
-            elif '//hls-fpga-machine-learning insert bram' in line:
+            elif (not config_weights) and '//hls-fpga-machine-learning insert bram' in line:
                 newline = line
                 for bram in model_brams:
                     newline += '#include \"firmware/weights/{}.h\"\n'.format(bram.name)
+            elif config_weights and '//hls-fpga-machine-learning insert weights' in line:
+                newline = line
+                for v in model.get_weight_variables():
+                    newline += indent + 'model_axi_t {name}[{shape}];\n'.format(name=v.name, shape=v.data_length)
+            elif config_weights and '//hls-fpga-machine-learning insert load weights' in line:
+                newline = line
+                for v in model.get_weight_variables():
+                    newline += indent + 'nnet::load_weights_from_txt<model_axi_t, {shape}>({name}, "{name}.txt");\n'\
+                        .format(name=v.name, shape=v.data_length)
             elif '//hls-fpga-machine-learning insert data' in line:
                 newline = line
                 offset = 0
@@ -400,7 +421,12 @@ def write_test_bench(self, model):
                 # Concatenate the input, output, and bram variables. Filter out empty/null values
                 all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars]))

-                top_level = indent + '{}({});\n'.format(model.config.get_project_name(), all_vars)
+                if config_weights:
+                    top_level = indent + '{}({},/*load_weights*/true);\n'.format(model.config.get_project_name(), all_vars)
+                    newline += top_level
+                    top_level = indent + '{}({},/*load_weights*/false);\n'.format(model.config.get_project_name(), all_vars)
+                else:
+                    top_level = indent + '{}({});\n'.format(model.config.get_project_name(), all_vars)
                 newline += top_level

             elif '//hls-fpga-machine-learning insert predictions' in line:
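When config_weights is set, the generated test bench calls the top-level function twice, presumably once with /*load_weights*/true to stream the weights in and once with /*load_weights*/false for the actual inference; write_bridge below repeats the same pattern. A sketch of a helper that captures it (the function name is an assumption, not part of this patch):

# Hedged sketch: build the generated call(s) to the top-level HLS function.
def emit_top_level_calls(project_name, all_vars, config_weights, indent='    '):
    if config_weights:
        # Two calls: first to load the weights, then to run inference (assumed intent).
        return (indent + '{}({},/*load_weights*/true);\n'.format(project_name, all_vars) +
                indent + '{}({},/*load_weights*/false);\n'.format(project_name, all_vars))
    return indent + '{}({});\n'.format(project_name, all_vars)

# Example: emit_top_level_calls('myproject', 'inputs,outputs,w2,b2', True)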
@@ -429,6 +455,10 @@ def write_bridge(self, model):
         # c++-python bridge
         ###################

+        io_type = model.config.get_config_value('IOType')
+        interface = model.config.get_config_value('AcceleratorConfig')['Interface'] if model.config.get_config_value('AcceleratorConfig') else None
+        config_weights = (io_type == 'io_stream') and (interface == 'axi_master')
+
         filedir = os.path.dirname(os.path.abspath(__file__))
         f = open(os.path.join(filedir,'../templates/vivado/myproject_bridge.cpp'),'r')
         fout = open('{}/{}_bridge.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()),'w')
@@ -455,8 +485,11 @@ def write_bridge(self, model):
             outputs_str = ', '.join(['{type} {name}[{shape}]'.format(type=dtype, name=o.name, shape=o.size_cpp()) for o in model_outputs])

             newline = ''
-            newline += indent + inputs_str + ',\n'
-            newline += indent + outputs_str + '\n'
+            newline += indent + inputs_str + '\n'
+            newline += indent + ', ' + outputs_str + '\n'
+            if config_weights:
+                for v in model.get_weight_variables():
+                    newline += indent + ', {type} {name} [{shape}]\n'.format(type=dtype, name=v.name, shape=v.data_length)

         elif '//hls-fpga-machine-learning insert wrapper' in line:
             dtype = line.split('#', 1)[1].strip()
             newline = ''
@@ -465,6 +498,12 @@ def write_bridge(self, model):
                 newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format(dtype, i.type.name, i.size_cpp(), i.name, i.name)
             newline += '\n'
+            if config_weights:
+                for b in model_brams:
+                    newline += indent + 'model_axi_t {name}_ap[{shape}];\n'.format(name=b.name, shape=b.data_length)
+                    newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format(dtype, 'model_axi_t', b.data_length, b.name, b.name)
+                newline += '\n'
+
             for o in model_outputs:
                 newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap'))
@@ -476,8 +515,11 @@ def write_bridge(self, model):
             # Concatenate the input, output, and bram variables. Filter out empty/null values
             all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars]))
-
-            top_level = indent + '{}({});\n'.format(model.config.get_project_name(), all_vars)
+            if config_weights:
+                top_level = indent + '{}({},/*load_weights*/true);\n'.format(model.config.get_project_name(), all_vars)
+                top_level += indent + '{}({},/*load_weights*/false);\n'.format(model.config.get_project_name(), all_vars)
+            else:
+                top_level = indent + '{}({});\n'.format(model.config.get_project_name(), all_vars)
             newline += top_level

             newline += '\n'
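Finally, a hedged end-to-end sketch of the configuration that exercises these io_stream plus axi_master code paths. The parameter names follow the VivadoAccelerator backend's create_initial_config; the model file, board choice, output directory and input shape are assumptions, not part of this patch.

# Hedged usage sketch (not part of the patch): convert a Keras model with the
# AXI-master interface so the weight-loading wrappers above are generated.
import numpy as np
import hls4ml
from tensorflow.keras.models import load_model

keras_model = load_model('model.h5')                            # assumed model file
config = hls4ml.utils.config_from_keras_model(keras_model, granularity='model')

hls_model = hls4ml.converters.convert_from_keras_model(
    keras_model,
    hls_config=config,
    backend='VivadoAccelerator',
    io_type='io_stream',        # required for the axi_master weight path
    interface='axi_master',     # selects the /*load_weights*/ wrappers emitted above
    driver='c',                 # request the standalone C application/SDK driver
    board='pynq-z2',            # any board with an axi_master design script (assumption)
    output_dir='my-hls-test',
)
hls_model.compile()

X = np.random.rand(10, 16).astype(np.float32)                   # toy input (shape is an assumption)
y_hls = hls_model.predict(X)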