1+ #ifndef NNET_CONV1D_STREAM_H_
2+ #define NNET_CONV1D_STREAM_H_
3+
4+ #include " nnet_types.h"
5+ #include " nnet_dense.h"
6+
7+ namespace nnet {
8+
/*
 * void kernel_shift_1d(shift_buffer, kernel_window)
 *
 * Args:
 *   shift_buffer  - CONFIG_T::n_chan values (one per channel) that become the newest column of the window
 *   kernel_window - flattened window of CONFIG_T::filt_width columns by CONFIG_T::n_chan channels,
 *                   addressed as [column * n_chan + channel]; modified in place
 *
 * Moves every column of kernel_window one position to the left (the contents of column 0 are overwritten)
 * and then copies shift_buffer into the last column.
 */
template <class data_T, typename CONFIG_T>
void kernel_shift_1d(
    typename data_T::value_type shift_buffer[CONFIG_T::n_chan],
    typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]
) {
    // Flattened offset of the right-most (newest) column of the window
    static constexpr auto last_col_offset = (CONFIG_T::filt_width - 1) * CONFIG_T::n_chan;

    /*
     * The window is shifted by hand, one column to the left.
     * nnet::shift_reg<T, N> cannot be used here: the window is later multiplied with the kernel weights
     * through the dense matrix multiplication, which is only implemented for plain arrays.
     * Provided certain timing constraints are met, Intel HLS still infers a shift operation and
     * implements kernel_window as a shift register.
     * To verify, see the synthesis report in report.html > Area Analysis of System
     */
    KernelShiftWidth:
    #pragma unroll
    for (int dst_col = 0; dst_col < CONFIG_T::filt_width - 1; ++dst_col) {
        // Start of the destination column and of its right-hand neighbour
        const int dst_base = dst_col * CONFIG_T::n_chan;
        const int src_base = dst_base + CONFIG_T::n_chan;

        KernelShiftChannel:
        #pragma unroll
        for (int ch = 0; ch < CONFIG_T::n_chan; ++ch) {
            kernel_window[dst_base + ch] = kernel_window[src_base + ch];
        }
    }

    // The freed last column receives the incoming values, one per channel
    KernelPushChannel:
    #pragma unroll
    for (int ch = 0; ch < CONFIG_T::n_chan; ++ch) {
        kernel_window[last_col_offset + ch] = shift_buffer[ch];
    }
}
47+
48+ /*
49+ * void shift_line_buffer(in_element, line_buffer, shift_buffer)
50+ *
51+ * Args:
52+ * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number of channels
53+ * line_buffer - chained array of shift registers, one for each row of the kernel and channel
54+ * shift_buffer - array elements popped from the line the buffer during the shift operation
55+ *
56+ * Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one
57+ * Popped elements are later used to update the kernel window, during the kernel_shift operation
58+ */
59+ template <class data_T , typename CONFIG_T>
60+ void shift_line_buffer_1d (
61+ const data_T &in_elem,
62+ nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan],
63+ typename data_T::value_type shift_buffer[CONFIG_T::n_chan]
64+ ) {
65+ // For every channel, insert the incoming pixel at end of the shift buffer
66+ UpdateBuffer:
67+ #pragma unroll
68+ for (int channel = 0 ; channel < CONFIG_T::n_chan; channel++) {
69+ shift_buffer[channel] = in_elem[channel];
70+ }
71+ }
72+
73+ /*
74+ * void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases)
75+ *
76+ * Args:
77+ * in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number of channels
78+ * res_stream - output stream, passed by reference to allow direct writing
79+ * line_buffer - chained array of shift registers, one for each row of the kernel and channel
80+ * kernel_window - array of values from the input curently convolved with the kernel
81+ * weights - Conv1D layer weights
82+ * biases - Conv1D layer biases
83+ *
84+ * Function executes 4 steps:
85+ * (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements
86+ * (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer
87+ * (3) Matrix mulitplication - performs dense matrix multiplication between the current input window and kernel weights
88+ * (4) Counter housekeeping - keeps track of current pixel and stride
89+ */
90+ template <class data_T , class res_T , typename CONFIG_T>
91+ void compute_output_buffer_1d (
92+ const data_T &in_elem,
93+ stream<res_T> &res_stream,
94+ nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan],
95+ typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan],
96+ const typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt],
97+ const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
98+ ) {
99+ // Thresholds
100+ static constexpr int lShiftX = CONFIG_T::filt_width - 1 ;
101+
102+ // X position pixel
103+ static int pX = 0 ;
104+
105+ // X strides
106+ static int sX = 0 ;
107+
108+ // Step 1 - Shift line buffer
109+ hls_register typename data_T::value_type shift_buffer[CONFIG_T::n_chan];
110+ nnet::shift_line_buffer_1d<data_T, CONFIG_T>(in_elem, line_buffer, shift_buffer);
111+
112+ // Step 2 - Kernel shift
113+ nnet::kernel_shift_1d<data_T, CONFIG_T>(shift_buffer, kernel_window);
114+
115+ // Check to see if we have a full kernel
116+ if ((sX - lShiftX) == 0 && pX > (lShiftX - 1 )) {
117+ // Step 3 - Dense matrix multiplication
118+ hls_register typename res_T::value_type res_out[CONFIG_T::n_filt];
119+ dense_resource<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(kernel_window, res_out, weights, biases);
120+
121+ // Write result to output stream
122+ hls_register res_T res_pack;
123+ CastLoop:
124+ #pragma unroll
125+ for (int channel = 0 ; channel < CONFIG_T::n_filt; channel++) {
126+ res_pack[channel] = res_out[channel];
127+ }
128+ res_stream.write (res_pack);
129+ }
130+
131+ // Reached end of image
132+ if ((pX + 1 ) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
133+ pX = 0 ;
134+ sX = 0 ;
135+ // Move to the right
136+ } else {
137+ pX++;
138+ sX = ((sX - lShiftX) == 0 ) ? (sX - CONFIG_T::stride_width + 1 ) : (sX + 1 );
139+ }
140+ }
141+
142+
143+ template <class data_T , class res_T , typename CONFIG_T>
144+ void conv_1d_cl (
145+ stream<data_T> &data,
146+ stream<res_T> &res,
147+ const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
148+ const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
149+ ) {
150+ // Line buffer and kernel window
151+ hls_register static nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan];
152+ hls_register static typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan];
153+
154+ // An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel)
155+ static const data_T padds (0 );
156+
157+ // Input image left-side padding
158+ PaddingLeftWidth:
159+ for (int col = 0 ; col < CONFIG_T::pad_left; col++) {
160+ compute_output_buffer_1d<data_T, res_T, CONFIG_T>(padds, res, line_buffer, kernel_window, weights, biases);
161+ }
162+
163+ // Read input image
164+ ReadInputWidth:
165+ for (int col = 0 ; col < CONFIG_T::in_width; col++) {
166+ compute_output_buffer_1d<data_T, res_T, CONFIG_T>(data.read (), res, line_buffer, kernel_window, weights, biases);
167+ }
168+
169+ // Input image right-side padding
170+ PaddingRightWidth:
171+ for (int col = 0 ; col < CONFIG_T::pad_right; col++) {
172+ compute_output_buffer_1d<data_T, res_T, CONFIG_T>(padds, res, line_buffer, kernel_window, weights, biases);
173+ }
174+ }
175+
176+ }
177+
178+ #endif
0 commit comments