Skip to content

Commit e703238

Browse files
committed
Added vcu128 board to VivadoAccelerator backend
1 parent 0599cca commit e703238

File tree

7 files changed

+1033
-0
lines changed

7 files changed

+1033
-0
lines changed

hls4ml/backends/vivado_accelerator/supported_boards.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111
"python_drivers": {"axi_stream": "axi_stream_driver.py"},
1212
"c_drivers": {}
1313
},
14+
"vcu128": {
15+
"part": "xcvu37p-fsvh2892-2L-e",
16+
"tcl_scripts": {"axi_master": "axi_master_design.tcl"},
17+
"python_drivers": {},
18+
"c_drivers": { "axi_master": "axi_master_design.c"}
19+
},
1420
"alveo-u50": {
1521
"part": "xcu50-fsvh2104-2-e",
1622
"tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Branch for adding VCU128 board support to hls4ml. Added and tested with Vivado 2019.1.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Name of the design; passed to setup.tcl and used as the prefix of the
# generated $(DESIGN)_platform / _standalone / _standalone_bsp directories.
DESIGN := design_1

# Default target: print a usage hint.
help:
	@echo "INFO: make <TAB> to show targets"
.PHONY: help

# Internal step (the leading "--" keeps it from being invoked as a plain
# command-line goal): run setup.tcl under xsct for the selected design.
--setup:
	xsct ./setup.tcl $(DESIGN)
.PHONY: --setup

# Prepare the standalone application sources: drop the template
# helloworld.c and link in the shared main.c / data.h.
# "ln -sf" replaces an existing link so the target can be re-run without
# failing on "File exists".
sdk: --setup
	rm -f $(DESIGN)_standalone/src/helloworld.c
	cd $(DESIGN)_standalone/src && ln -sf ../../common/main.c main.c
	cd $(DESIGN)_standalone/src && ln -sf ../../common/data.h data.h
.PHONY: sdk

# Open the Xilinx SDK GUI on this directory as the workspace (in background).
gui:
	xsdk --workspace . &
.PHONY: gui

# Remove generated project directories, tool metadata and logs.
clean:
	rm -rf $(DESIGN)_platform
	rm -rf $(DESIGN)_standalone
	rm -rf $(DESIGN)_standalone_bsp
	rm -rf RemoteSystemsTempFiles
	rm -rf .Xil
	rm -rf .metadata
	rm -f *.log
.PHONY: clean

# Like clean, and additionally remove the hardware definition files in hdf/.
ultraclean: clean
	rm -rf hdf/*.hdf
.PHONY: ultraclean
Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
/**
2+
*
3+
* Set Heap Size in ldscript.ld to 0x1000000 (16MB)
4+
*
5+
*/
6+
7+
#include "xmyproject_axi.h" /* TODO: design-dependent name */
8+
#include "stdio.h" /* PRINTF */
9+
#include "unistd.h" /* sleep */
10+
#include "stdlib.h"
11+
#include "malloc.h"
12+
#include "assert.h"
13+
#include "xil_io.h" /* peripheral read/write wrappers */
14+
#include "platform.h" /* platform init/cleanup functions */
15+
#include "xil_cache.h" /* enable/disable caches etc */
16+
#include "xil_printf.h" /* UART debug print functions */
17+
#include "xparameters.h" /* peripherals base addresses */
18+
#include "xtmrctr.h" /* timer, Xilinx IP Timer Counter */
19+
20+
#include "data.h"
21+
22+
/*#define EEMBC_POWER 1
23+
24+
#ifdef EEMBC_POWER
25+
#include "xgpio.h" /* AXI GPIO drivers */
26+
27+
/*#define PIN 0x01
28+
#define GPIO_PMOD_PIN_DEVICE_ID XPAR_GPIO_0_DEVICE_ID
29+
30+
#define set_pin_high(InstancePtr, Mask) \
31+
XGpio_DiscreteWrite(InstancePtr, 1, Mask)
32+
33+
#define set_pin_low(InstancePtr, Mask) \
34+
XGpio_DiscreteClear(InstancePtr, 1, Mask)
35+
36+
XGpio Gpio;
37+
#endif
38+
*/
39+
40+
//#define __DEBUG__
41+
42+
#define MAX_PRINT_ELEMENTS (16)
43+
44+
#define PRINTF printf
45+
46+
const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS;
47+
const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS;
48+
49+
#if 1
50+
/* Accelerator verification */
51+
#define REFERENCE_OUTPUTS data_y_hls_outputs
52+
#else
53+
/* Accelerator validation */
54+
#define REFERENCE_OUTPUTS data_y_outputs
55+
//#define REFERENCE_OUTPUTS data_y_keras_outputs
56+
#endif
57+
58+
/**
 * Return the index of the largest element of data[0 .. n_elements-1].
 *
 * The running maximum is seeded with the first element rather than 0.0, so
 * inputs whose values are all negative yield the true arg-max instead of
 * always reporting index 0.
 *
 * Ties resolve to the highest index holding the maximum (comparison is >=).
 *
 * @param data        Array of values to scan; may be NULL only if n_elements is 0.
 * @param n_elements  Number of valid elements in data.
 * @return Index of the maximum element, or 0 when the input is empty or NULL.
 */
unsigned get_max(float *data, unsigned n_elements) {
    if (data == NULL || n_elements == 0) {
        return 0;
    }

    unsigned max_index = 0;
    float max_value = data[0];
    for (unsigned i = 1; i < n_elements; i++) {
        if (data[i] >= max_value) {
            max_index = i;
            max_value = data[i];
        }
    }
    return max_index;
}
68+
69+
float *inputs_mem = NULL;
70+
float *outputs_mem = NULL;
71+
float *reference_mem = NULL;
72+
73+
/* Accelerator configuration */
74+
XMyproject_axi accelerator; /* TODO: design-dependent name */
75+
XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */
76+
77+
/* Accelerator initialization routine */
78+
void init_accelerators() {
79+
PRINTF("INFO: Initializing accelerator\r\n");
80+
accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */
81+
if (accelerator_cfg) {
82+
int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */
83+
if (status != XST_SUCCESS) {
84+
PRINTF("ERROR: Initializing accelerator\r\n");
85+
}
86+
}
87+
}
88+
89+
/* Reference implementation of the accelerator in software */
90+
int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) {
91+
#ifdef __DEBUG__
92+
PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n");
93+
#endif
94+
/* See data.h for inputs and outputs */
95+
for (unsigned i = 0; i < n_samples * n_y_ouputs; i++) {
96+
sw_outputs_mem[i] = REFERENCE_OUTPUTS[i];
97+
}
98+
return 0;
99+
}
100+
101+
/* Profiling utilities */
102+
static XTmrCtr TimerCounterInst;
103+
#define TMRCTR_DEVICE_ID XPAR_TMRCTR_0_DEVICE_ID
104+
#define TIMER_CNTR_0 0
105+
#define TIMER_CNTR_1 1
106+
107+
void start_64b_counter() {
108+
XTmrCtr_Start(&TimerCounterInst, TIMER_CNTR_0);
109+
XTmrCtr_Start(&TimerCounterInst, TIMER_CNTR_1);
110+
}
111+
112+
void stop_64b_counter() {
113+
XTmrCtr_Stop(&TimerCounterInst, TIMER_CNTR_0);
114+
XTmrCtr_Stop(&TimerCounterInst, TIMER_CNTR_1);
115+
}
116+
117+
u64 get_64b_counter_value() {
118+
//printf("bytes %u\n\r", sizeof(u64));
119+
u64 lo_counter = XTmrCtr_GetValue(&TimerCounterInst, TIMER_CNTR_0);
120+
u64 hi_counter = XTmrCtr_GetValue(&TimerCounterInst, TIMER_CNTR_1);
121+
u64 counter = (hi_counter << 32) | lo_counter;
122+
//printf("INFO: hi = %lu, lo = %lu, total = %lu\n\r", hi_counter, lo_counter, counter);
123+
return counter;
124+
}
125+
126+
#if 0
/* Disabled: convert the clock-count difference (clk_stop - clk_start) to seconds. */
double get_elapsed_time(u64 clk_start, u64 clk_stop) {
    const u64 elapsed_clks = clk_stop - clk_start;
    const double seconds_per_clk = (1.0/XPAR_AXI_TIMER_MCU_CLOCK_FREQ_HZ);

    return (elapsed_clks * seconds_per_clk);
}
#endif
131+
132+
/*
 * Convert a clock-cycle count into nanoseconds using the timer clock
 * frequency XPAR_AXI_TIMER_0_CLOCK_FREQ_HZ. The arithmetic is done in
 * double precision and the result is narrowed to float on return.
 */
float get_elapsed_time_ns(u64 clks) {
    const double scaled_clks = clks * 1000000000.0;

    return (float)(scaled_clks/XPAR_AXI_TIMER_0_CLOCK_FREQ_HZ);
}
135+
136+
137+
/* Dump data to the console */
138+
void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) {
139+
PRINTF("INFO: %s[%u][%u]:\r\n", label, n_samples, feature_count);
140+
/* Print at most MAX_PRINT_ELEMENTS */
141+
for (unsigned i = 0; i < n_samples && i < MAX_PRINT_ELEMENTS; i++) {
142+
PRINTF("INFO: [%u] ", i);
143+
for (unsigned j = 0; j < feature_count; j++) {
144+
unsigned index = i * feature_count + j;
145+
PRINTF("%f ", data[index]);
146+
}
147+
PRINTF("\r\n");
148+
}
149+
}
150+
151+
/* The top of the hill :-) */
152+
int main(int argc, char** argv) {
153+
154+
int status;
155+
u64 calibration_time;
156+
double __attribute__ ((unused)) sw_elapsed = 0;
157+
u64 hw_elapsed = 0;
158+
u64 cache_elapsed = 0;
159+
unsigned hw_errors;
160+
161+
char __attribute__ ((unused)) dummy; /* dummy input */
162+
163+
/* Initialize platform (uart and caches) */
164+
init_platform();
165+
166+
PRINTF("\r\n");
167+
PRINTF("INFO: ==================================================\r\n");
168+
PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */
169+
PRINTF("INFO: ==================================================\r\n");
170+
171+
init_accelerators();
172+
173+
/* Timer Counter */
174+
status = XTmrCtr_Initialize(&TimerCounterInst, TMRCTR_DEVICE_ID);
175+
if (status != XST_SUCCESS){
176+
print("ERROR: Timer counter initialization failed \r\n");
177+
return status;
178+
}
179+
180+
XTmrCtr_SetOptions(&TimerCounterInst, TIMER_CNTR_0,
181+
XTC_AUTO_RELOAD_OPTION |
182+
XTC_CASCADE_MODE_OPTION);
183+
184+
print("INFO: Timer counter initialized\r\n");
185+
186+
inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float));
187+
outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
188+
reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
189+
190+
/* Calibration */
191+
start_64b_counter();
192+
sleep(1);
193+
stop_64b_counter();
194+
calibration_time = get_64b_counter_value();
195+
PRINTF("INFO: Time calibration for one second (%lf sec, %llu clk)\r\n", get_elapsed_time_ns(calibration_time), calibration_time);
196+
197+
/* Initialize memory */
198+
PRINTF("INFO: Initialize memory\r\n");
199+
PRINTF("INFO: - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */
200+
PRINTF("INFO: - Inputs count: %u\r\n", N_X_INPUTS);
201+
PRINTF("INFO: - Outputs count: %u\r\n", N_Y_OUTPUTS);
202+
PRINTF("INFO: - Data size: %u B\r\n", sizeof(float));
203+
PRINTF("INFO: - Total input size: %u B, %.2f KB, %.2f MB\r\n", N_X_INPUTS * N_SAMPLES * sizeof(float), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
204+
PRINTF("INFO: - Total output size: %u B, %.2f KB, %.2f MB\r\n", N_Y_OUTPUTS * N_SAMPLES * sizeof(float), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
205+
206+
// Set Heap Size in ldscript.ld to 0x1000000 (16MB)
207+
//malloc_stats();
208+
209+
for (int i = 0; i < INPUT_N_ELEMENTS; i++) {
210+
inputs_mem[i] = data_X_inputs[i];
211+
}
212+
for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) {
213+
outputs_mem[i] = 0x0;
214+
}
215+
216+
/* ****** SW REFERENCE ****** */
217+
PRINTF("INFO: ==================================================\r\n");
218+
PRINTF("INFO: Start SW reference implementation\r\n");
219+
start_64b_counter();
220+
sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS);
221+
stop_64b_counter();
222+
sw_elapsed = get_64b_counter_value();
223+
PRINTF("INFO: ==================================================\r\n");
224+
PRINTF("INFO: Press any key to start:\r\n");
225+
dummy = inbyte();
226+
//PRINTF("INFO:");
227+
228+
/* ****** HW ACCELERATOR ****** */
229+
PRINTF("INFO: Start HW accelerator\r\n");
230+
start_64b_counter();
231+
Xil_DCacheFlushRange((UINTPTR)inputs_mem, INPUT_N_ELEMENTS * sizeof(float));
232+
Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
233+
Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float));
234+
stop_64b_counter();
235+
cache_elapsed = get_64b_counter_value();
236+
237+
for (unsigned j = 0; j < N_SAMPLES; j++) {
238+
float *inputs_mem_i = inputs_mem + j * N_X_INPUTS;
239+
float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS;
240+
241+
/* Configure the accelerator */
242+
start_64b_counter();
243+
XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */
244+
XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */
245+
246+
XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */
247+
248+
/* Polling */
249+
while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */
250+
251+
/* Get error status */
252+
//hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */
253+
stop_64b_counter();
254+
hw_elapsed += get_64b_counter_value();
255+
}
256+
257+
start_64b_counter();
258+
Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
259+
stop_64b_counter();
260+
cache_elapsed += get_64b_counter_value();
261+
262+
PRINTF("INFO: HW accelerator done!\r\n");
263+
264+
/* ****** VALIDATION ****** */
265+
PRINTF("INFO: ================== Verification ==================\r\n");
266+
#ifdef __DEBUG__
267+
PRINTF("INFO: Dump data\r\n");
268+
dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS);
269+
dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS);
270+
dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS);
271+
#endif
272+
273+
#ifdef __DEBUG__
274+
PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed);
275+
#endif
276+
PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES);
277+
PRINTF("INFO: - total %f sec\r\n", get_elapsed_time_ns(hw_elapsed));
278+
PRINTF("INFO: - per-inference %.12f sec (%f ns)\r\n", get_elapsed_time_ns(hw_elapsed) / (N_SAMPLES), (get_elapsed_time_ns(hw_elapsed)*1000.0) / (N_SAMPLES));
279+
PRINTF("INFO: Cache flush time: %f sec\r\n", get_elapsed_time_ns(cache_elapsed));
280+
#ifdef __DEBUG__
281+
PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed));
282+
#endif
283+
284+
hw_errors = 0;
285+
#if 1
286+
/* Accelerator verification */
287+
for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) {
288+
if (outputs_mem[i] != reference_mem[i]) {
289+
PRINTF("ERROR: [%d]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]);
290+
hw_errors++;
291+
}
292+
}
293+
PRINTF("INFO: Total errors = %d (out of %d elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS);
294+
if (hw_errors > 0)
295+
PRINTF("INFO: Verification: FAIL\r\n");
296+
else
297+
PRINTF("INFO: Verification: PASS!\r\n");
298+
#else
299+
/* Accelerator validation */
300+
for (unsigned s = 0; s < N_SAMPLES; s++) {
301+
unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
302+
unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
303+
if (hw_digit != ref_digit) {
304+
#ifdef __DEBUG__
305+
PRINTF("ERROR: [%d]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit);
306+
#endif
307+
hw_errors++;
308+
}
309+
}
310+
float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0;
311+
float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0);
312+
PRINTF("INFO: Total errors = %d (out of %d digits)\r\n", hw_errors, N_SAMPLES);
313+
PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate);
314+
PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy);
315+
#endif
316+
317+
PRINTF("INFO: ==================================================\r\n");
318+
319+
320+
#ifdef EEMBC_POWER
321+
/* Initialize the GPIO driver */
322+
status = XGpio_Initialize(&Gpio, GPIO_PMOD_PIN_DEVICE_ID);
323+
if (status != XST_SUCCESS) {
324+
xil_printf("GPIO Initialization Failed\r\n");
325+
return XST_FAILURE;
326+
}
327+
328+
set_pin_low(&Gpio, PIN);
329+
330+
PRINTF("INFO: Connect logic analyzer to the pin 3 of Pmod D\r\n");
331+
PRINTF("INFO: Press any key to start:\r\n");
332+
dummy = inbyte();
333+
334+
/* Loop forever */
335+
for (unsigned i; i < 100; i++) {
336+
set_pin_high(&Gpio, PIN);
337+
338+
sleep(1);
339+
340+
set_pin_low(&Gpio, PIN);
341+
342+
sleep(1);
343+
}
344+
#endif
345+
346+
cleanup_platform();
347+
348+
return 0;
349+
}
350+

0 commit comments

Comments
 (0)