@@ -81,18 +81,18 @@ void UDotUnit2LstmPe(const int vect_length, const int num_tiles,
 #pragma HLS FUNCTION_INSTANTIATE variable=num_iter
 #pragma HLS FUNCTION_INSTANTIATE variable=num_tiles
 #pragma HLS FUNCTION_INSTANTIATE variable=num_timesteps
-  // #pragma HLS INTERFACE ap_ctrl_none port=return
   assert(vect_length % num_tiles == 0);
-
   const int kNumElemsPerTile = vect_length / num_tiles;
   AccumType y1_mac = 0;
   AccumType y2_mac = 0;
-
-  ReduceProd_PE_IterTimesteps_Loop:
+  U_PE_Loop:
   for (int i = 0; i < num_iter * num_timesteps; ++i) {
-    ReduceProd_PE_Loop:
     for (int j = 0; j < kNumElemsPerTile; ++j) {
+#ifdef __VITIS_HLS__
 #pragma HLS PIPELINE II=1 style=frp
+#else
+#pragma HLS PIPELINE II=1
+#endif
       if (j == 0) {
         y1_mac = 0;
         y2_mac = 0;
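Note on the added guard: the `style=frp` (free-running pipeline) option is only understood by Vitis HLS, so the patch wraps it in `#ifdef __VITIS_HLS__` and falls back to a plain `PIPELINE II=1` elsewhere. If the same guard ends up repeated at every pipelined loop, one way to keep it in a single place is a `_Pragma`-based macro. This is a hypothetical helper, not part of the patch, and it assumes the toolchain honors `_Pragma` (recent Vitis HLS does; older Vivado HLS may not, which makes the inline `#ifdef` the safer choice):

```cpp
// Hypothetical helper, not in this patch: centralize the
// tool-dependent PIPELINE pragma in one macro.
#ifdef __VITIS_HLS__
#define SVD_PIPELINE_II1 _Pragma("HLS PIPELINE II=1 style=frp")
#else
#define SVD_PIPELINE_II1 _Pragma("HLS PIPELINE II=1")
#endif
```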
@@ -114,138 +114,11 @@ void UDotUnit2LstmPe(const int vect_length, const int num_tiles,
   }
 }
 
-/**
- * @brief Accumulate partial results from ReduceProd PEs.
- *
- * @param acc1_streams The acc 1 streams, each from a PE
- * @param acc2_streams The acc 2 streams, each from a PE
- * @param y1_stream The single y_1 stream
- * @param y2_stream The single y_2 stream
- *
- * @tparam VectLength The input vector dimension
- * @tparam NumTiles The number of used tiles (to determine the
- *                  number of PEs)
- * @tparam NumZeroTiles The number of pruned tiles (to determine the
- *                  number of PEs)
- * @tparam NumIter The number of refinement steps (to make the
- *                  pipeline longer)
- * @tparam NumTimesteps The number of LSTM timesteps (to make the
- *                  pipeline longer)
- * @tparam AdderTreeDesign Enable or disable AdderTree design. Default is
- *                  active, i.e. true.
- */
-template <int VectLength, int NumTiles, int NumZeroTiles, int NumIter,
-  int NumTimesteps, bool AdderTreeDesign = true>
-void UDotUnit2LstmAccumulator(svd::AccumStream (&acc1_streams)[NumTiles-NumZeroTiles],
-    svd::AccumStream (&acc2_streams)[NumTiles-NumZeroTiles],
-    svd::ActivationStream &y1_stream,
-    svd::ActivationStream &y2_stream) {
-#pragma HLS INLINE off
-  // #pragma HLS INTERFACE ap_ctrl_none port=return
-  const int kNumPEs = NumTiles - NumZeroTiles;
-
-  if (AdderTreeDesign) {
-    // Determine the number of ranks for the adder tree and declare the array.
-    // The adder_tree is larger than required, as each rank only needs to be half the size of the previous rank.
-    const unsigned kNumPEsLog2 = hlsutils::log2<kNumPEs>::value;
-    const unsigned kNumPEsSub1Log2 = hlsutils::log2<kNumPEs - 1>::value;
-    const unsigned kNumRanks = kNumPEsLog2 != kNumPEsSub1Log2 ? kNumPEsLog2 : kNumPEsLog2 + 1;
-    svd::AccumD adder_tree1[kNumRanks][kNumPEs];
-    svd::AccumD adder_tree2[kNumRanks][kNumPEs];
-
-    unsigned rank_size = kNumPEs;
-
-    for (int i = 0; i < NumIter * NumTimesteps; ++i) {
-#pragma HLS PIPELINE II=1 style=frp
-      add_level_loop:
-      for (int adder_tree_rank = kNumRanks - 1; adder_tree_rank >= 0; --adder_tree_rank) {
-        const bool kLoopInit = adder_tree_rank == kNumRanks - 1 ? true : false;
-        const bool kLoopEpilog = adder_tree_rank == 0 ? true : false;
-
-        if (kLoopInit) {
-          rank_size = kNumPEs;
-        }
-
-        const bool prev_rank_is_odd = rank_size % 2 == 0 ? false : true;
-        rank_size = (rank_size + 1) / 2;
-        // std::cout << "[" << adder_tree_rank << "] rank_size: " << rank_size << "\n";
-
-        add_col_loop:
-        for (int jj = 0; jj < (kNumPEs + 1) / 2; ++jj) {
-          if (jj < rank_size) {
-            if (prev_rank_is_odd && jj == rank_size - 1) {
-              // Bypass, no adder required.
-              if (kLoopInit) {
-                adder_tree1[adder_tree_rank][jj] = acc1_streams[jj * 2].read();
-                adder_tree2[adder_tree_rank][jj] = acc2_streams[jj * 2].read();
-                // std::cout << "\t\tstream[" << adder_tree_rank << "][" << jj * 2 << "] = [" << jj << "]\n";
-              } else {
-                adder_tree1[adder_tree_rank][jj] = adder_tree1[adder_tree_rank + 1][jj * 2];
-                adder_tree2[adder_tree_rank][jj] = adder_tree2[adder_tree_rank + 1][jj * 2];
-                // std::cout << "\t\tbuffer[" << adder_tree_rank << "][" << jj * 2 << "] = [" << adder_tree_rank + 1 << "][" << jj << "]\n";
-              }
-            } else {
-              if (kLoopInit) {
-                auto y1_acc = acc1_streams[jj * 2].read() + acc1_streams[jj * 2 + 1].read();
-                auto y2_acc = acc2_streams[jj * 2].read() + acc2_streams[jj * 2 + 1].read();
-#pragma HLS RESOURCE variable=y1_acc core=AddSub_DSP
-#pragma HLS RESOURCE variable=y2_acc core=AddSub_DSP
-                adder_tree1[adder_tree_rank][jj] = y1_acc;
-                adder_tree2[adder_tree_rank][jj] = y2_acc;
-                // std::cout << "\tstreams[" << adder_tree_rank << "][" << jj << "] = [" << jj * 2 << "] + [" << jj * 2 + 1 << "]\n";
-              } else {
-                auto y1_acc = adder_tree1[adder_tree_rank + 1][jj * 2] + adder_tree1[adder_tree_rank + 1][jj * 2 + 1];
-                auto y2_acc = adder_tree2[adder_tree_rank + 1][jj * 2] + adder_tree2[adder_tree_rank + 1][jj * 2 + 1];
-#pragma HLS RESOURCE variable=y1_acc core=AddSub_DSP
-#pragma HLS RESOURCE variable=y2_acc core=AddSub_DSP
-                adder_tree1[adder_tree_rank][jj] = y1_acc;
-                adder_tree2[adder_tree_rank][jj] = y2_acc;
-                // std::cout << "\tbuffer[" << adder_tree_rank << "][" << jj << "] = [" << adder_tree_rank + 1 << "][" << jj * 2 << "] + [" << adder_tree_rank + 1 << "][" << jj * 2 + 1 << "]\n";
-              }
-            }
-          }
-        }
-        if (kLoopEpilog) {
-          y1_stream.write(adder_tree1[0][0]);
-          y2_stream.write(adder_tree2[0][0]);
-          // std::cout << "\n";
-        }
-      }
-    }
-  } else {
-    svd::AccumD y1_acc = 0;
-    svd::AccumD y2_acc = 0;
-    for (int i = 0; i < NumIter * NumTimesteps; ++i) {
-      AdderTree_PE_Loop:
-      for (int j = 0; j < kNumPEs; ++j) {
-#pragma HLS PIPELINE II=1 style=frp
-        if (j == 0) {
-          y1_acc = 0;
-          y2_acc = 0;
-        }
-        auto acc1 = y1_acc + acc1_streams[j].read();
-        auto acc2 = y2_acc + acc2_streams[j].read();
-#pragma HLS RESOURCE variable=acc1 core=AddSub_DSP
-#pragma HLS RESOURCE variable=acc2 core=AddSub_DSP
-        y1_acc = acc1;
-        y2_acc = acc2;
-        if (j == kNumPEs - 1) {
-          y1_stream.write(y1_acc);
-          y2_stream.write(y2_acc);
-        }
-      }
-    }
-  }
-}
-
 /**
  * @brief Reduce Product Unit of an LSTM gate. It computes the parallel dot
  *        product between input x and a U vector. It also performs the
  *        refinement steps and feeds the Element Wise Product Unit.
  *
- * @todo (22/03/2019 - algorithm): The INTERNAL_BUFFER design needs to be
- *       updated with the NumIter and NumTimesteps iterations.
- *
  * @param[in] x1_streams The input x of LSTM n.1
  * @param[in] x2_streams The input x of LSTM n.2
  * @param[in] gate_u_streams The common U weight vector component
@@ -255,136 +128,46 @@ void UDotUnit2LstmAccumulator(svd::AccumStream (&acc1_streams)[NumTiles-NumZeroT
  * @tparam VectLength The length of the weight vector
  * @tparam NumTiles The number of tiles the vector is divided into
  * @tparam NumZeroTiles The number of zeroed, i.e. pruned, tiles
+ * @tparam NumIter The number of refinement steps
+ * @tparam NumTimesteps The number of LSTM timesteps
  */
 template <int VectLength, int NumTiles, int NumZeroTiles, int NumIter,
     int NumTimesteps>
 void UDotUnit2Lstm(svd::ActivationStream (&x1_streams)[NumTiles-NumZeroTiles],
-  svd::ActivationStream (&x2_streams)[NumTiles-NumZeroTiles],
-  WeightStream (&gate_u_streams)[NumTiles-NumZeroTiles],
-  svd::ActivationStream &y1,
-  svd::ActivationStream &y2) {
+    svd::ActivationStream (&x2_streams)[NumTiles-NumZeroTiles],
+    WeightStream (&gate_u_streams)[NumTiles-NumZeroTiles],
+    svd::ActivationStream &y1,
+    svd::ActivationStream &y2) {
   assert(VectLength % NumTiles == 0);
   assert(NumZeroTiles < NumTiles);
-  assert(NumTiles >= 8);
   assert(NumTiles % 2 == 0);
-  // =============================================================================
-#define REDUCE_PROD_2LSTM_DATAFLOW_DESIGN
-  // #define REDUCE_PROD_2LSTM_MERGE_DSP // the accuracy is killed, possible error.
-  // =============================================================================
-#if !defined(REDUCE_PROD_2LSTM_DATAFLOW_DESIGN) && \
-    defined(REDUCE_PROD_2LSTM_MERGE_DSP) && FIX_WIDTH == 8
 #pragma HLS DATAFLOW
277- // ===========================================================================
278- // Implements shared DSP and LUT function for computing 2 mac ops in 1 DSP.
279- // ===========================================================================
280- const int kNumNonZeroTiles = NumTiles - NumZeroTiles;
281- const int kNumPEs = kNumNonZeroTiles ;
282- const int kNumElemsPerTile = VectLength / NumTiles;
283- const int kStreamDepth = NumIter * kNumElemsPerTile ;
284- svd::AccumD y1_mul[kNumPEs ];
285- svd::AccumD y2_mul[kNumPEs ];
286- #pragma HLS ARRAY_PARTITION variable=y1_mul complete dim=1
287- #pragma HLS ARRAY_PARTITION variable=y2_mul complete dim=1
288- #pragma HLS STREAM variable=y1_mul depth=kStreamDepth
289- #pragma HLS STREAM variable=y2_mul depth=kStreamDepth
290-
291- svd::AccumD y1_acc = 0 ;
292- svd::AccumD y2_acc = 0 ;
293- #pragma HLS RESOURCE variable=y1_acc core=AddSub_DSP
294- #pragma HLS RESOURCE variable=y2_acc core=AddSub_DSP
295-
296- for (int n = 0 ; n < NumIter * NumTimesteps; ++n) {
297- ReduceProd_PE_Loop:
298- for (int i = 0 ; i < kNumPEs ; ++i) {
299- #if FIX_WIDTH == 8
300- #pragma HLS ALLOCATION instances=dot_prod_dsp_lut limit=kNumPEs function
301- #else
302- #pragma HLS ALLOCATION instances=dot_prod_dsp_lut_generic limit=kNumPEs function
303- #endif
304- #pragma HLS UNROLL
305- y1_mul[i] = 0 ;
306- y2_mul[i] = 0 ;
307- ReduceProd_Tile_Loop:
308- for (int j = 0 ; j < kNumElemsPerTile / 2 ; ++j) {
309- #pragma HLS PIPELINE II=1 style=frp
310- // auto p0_tmp = y_dsp * w_dsp + y_lut * w_lut;
311- // auto p1_tmp = x_dsp * w_dsp + x_lut * w_lut;
312- // p0 += p0_tmp;
313- // p1 += p1_tmp;
314- #if FIX_WIDTH == 8
315- svd::AccumD x_dsp = 0 ; // x1_streams[i].read();
316- svd::AccumD y_dsp = 0 ; // x2_streams[i].read();
317- svd::AccumD w_dsp = 0 ; // gate_u_streams[i].read();
318- svd::AccumD x_lut = 0 ; // x1_streams[i].read();
319- svd::AccumD y_lut = 0 ; // x2_streams[i].read();
320- svd::AccumD w_lut = 0 ; // gate_u_streams[i].read();
321- x_dsp.range () = x1_streams[i].read ().range ();
322- y_dsp.range () = x2_streams[i].read ().range ();
323- w_dsp.range () = gate_u_streams[i].read ().range ();
324- x_lut.range () = x1_streams[i].read ().range ();
325- y_lut.range () = x2_streams[i].read ().range ();
326- w_lut.range () = gate_u_streams[i].read ().range ();
327- dot_prod_dsp_lut (x_dsp, y_dsp, w_dsp, x_lut, y_lut, w_lut,
328- y2_mul[i], y1_mul[i]);
329- #else
330- svd::AccumD x_dsp = x1_streams[i].read ();
331- svd::AccumD y_dsp = x2_streams[i].read ();
332- svd::AccumD w_dsp = gate_u_streams[i].read ();
333- svd::AccumD x_lut = x1_streams[i].read ();
334- svd::AccumD y_lut = x2_streams[i].read ();
335- svd::AccumD w_lut = gate_u_streams[i].read ();
336- dot_prod_dsp_lut_generic (x_dsp, y_dsp, w_dsp, x_lut, y_lut, w_lut,
337- y2_mul[i], y1_mul[i]);
338- #endif
339- }
340- }
341- }
342-
343- ReduceProd_Accumulation_Loop:
344- for (int i = 0 ; i < NumIter * NumTimesteps; ++i) {
345- for (int j = 0 ; j < kNumPEs ; ++j) {
346- #pragma HLS PIPELINE II=1 style=frp
347- y1_acc += y1_mul[j];
348- y2_acc += y2_mul[j];
349- }
350- y1.write (y1_acc);
351- y2.write (y2_acc);
352- }
-#else
-  // =============================================================================
-  // Implements #mac_PEs = NumTiles - NumZeroTiles & #Adder_Tree = 1
-  // =============================================================================
-  // #pragma HLS INTERFACE ap_ctrl_none port=return
-#pragma HLS DATAFLOW
-  // #pragma HLS INLINE
-
-  const unsigned kNumNonZeroTiles = NumTiles - NumZeroTiles;
-  const unsigned kNumPEs = kNumNonZeroTiles;
-  // NOTE: both the PEs and the adder-tree have II=1, but the adder-tree reads
-  // in round-robin fashion from the PE queues. Hence, before the adder-tree
-  // reads again from the same PE queue, kNumPEs cycles pass. This constrains
-  // the depth of the queues to kNumPEs. (THIS WON'T WORK: THE CONSUMER RATE IS TOO LOW.)
-  // FIXED: using an adder tree allows the use of streams of depth 1.
-  const unsigned kStreamDepth = 1;  // VectLength / NumTiles;
-
-  hls::stream<svd::AccumD> acc1_streams[kNumNonZeroTiles];
-  hls::stream<svd::AccumD> acc2_streams[kNumNonZeroTiles];
+#pragma HLS INLINE
+  const unsigned kNumPEs = NumTiles - NumZeroTiles;
+  const unsigned kStreamDepth = 2;
+  hls::stream<svd::AccumD> acc1_streams[kNumPEs];
+  hls::stream<svd::AccumD> acc2_streams[kNumPEs];
 #pragma HLS ARRAY_PARTITION variable=acc1_streams complete dim=1
 #pragma HLS ARRAY_PARTITION variable=acc2_streams complete dim=1
 #pragma HLS STREAM variable=acc1_streams depth=kStreamDepth
 #pragma HLS STREAM variable=acc2_streams depth=kStreamDepth
-
   PE_Loop:
   for (int pe = 0; pe < kNumPEs; ++pe) {
 #pragma HLS UNROLL
     UDotUnit2LstmPe<svd::ActivationD, svd::WeightD, svd::AccumD>(VectLength,
-      NumTiles, NumIter, NumTimesteps,
-      x1_streams[pe], x2_streams[pe], gate_u_streams[pe], acc1_streams[pe],
-      acc2_streams[pe]);
+      NumTiles, NumIter, NumTimesteps, x1_streams[pe], x2_streams[pe],
+      gate_u_streams[pe], acc1_streams[pe], acc2_streams[pe]);
+  }
+  U_AdderTree_Loop:
+  for (int i = 0; i < NumIter * NumTimesteps; ++i) {
+#ifdef __VITIS_HLS__
+#pragma HLS PIPELINE II=1 style=frp
+#else
+#pragma HLS PIPELINE II=1
+#endif
+    y1.write(hlsutils::adder_tree<svd::AccumD, kNumPEs>(acc1_streams));
+    y2.write(hlsutils::adder_tree<svd::AccumD, kNumPEs>(acc2_streams));
   }
-  UDotUnit2LstmAccumulator<VectLength, NumTiles, NumZeroTiles, NumIter, NumTimesteps>(
-    acc1_streams, acc2_streams, y1, y2);
-#endif  // end REDUCE_PROD_2LSTM_DATAFLOW_DESIGN
 }
 
 #ifdef __VITIS_HLS__
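The hand-written `UDotUnit2LstmAccumulator` deleted above is replaced by a call to `hlsutils::adder_tree`, whose implementation is not part of this diff. For reference, a reduction utility with that call signature could look like the sketch below; the real `hlsutils::adder_tree` may differ, and `svd::AccumD` is assumed to support `operator+`:

```cpp
#include <hls_stream.h>

namespace hlsutils {
// Sketch only: reads one element from each of the N input streams and
// returns their sum, reduced pairwise so HLS can infer a balanced
// log2(N)-deep adder tree instead of a sequential accumulator chain.
template <typename T, int N>
T adder_tree(hls::stream<T> (&streams)[N]) {
#pragma HLS INLINE
  T partial[N];
#pragma HLS ARRAY_PARTITION variable=partial complete
  Read_Streams: for (int i = 0; i < N; ++i) {
#pragma HLS UNROLL
    partial[i] = streams[i].read();
  }
  // Pairwise reduction; a non-power-of-two N is handled by letting the
  // leftover element ride along to the next level.
  Reduce_Levels: for (int d = 1; d < N; d *= 2) {
#pragma HLS UNROLL
    Reduce_Pairs: for (int i = 0; i + d < N; i += 2 * d) {
#pragma HLS UNROLL
      partial[i] += partial[i + d];
    }
  }
  return partial[0];
}
}  // namespace hlsutils
```

Compared to the deleted rank-by-rank version, an inlined pairwise reduction like this keeps the whole tree combinational within one pipelined iteration, which is what lets `U_AdderTree_Loop` run at II=1 with shallow depth-2 FIFOs.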
@@ -416,6 +199,7 @@ void KernelU(const int num_active_inputs,
              const int input_size,
              const int num_refinements[params::N],
              const bool pad_output,
+             // hls::stream<typename params::IndexU_Type>& z_idx_port,
              hls::stream<typename params::VectTuAxiPacketType>& x_port,
              hls::stream<typename params::VectTuAxiPacketType>& u_port,
              hls::stream<typename WrapperAxisG::PacketType>& xu_port) {