Skip to content

Commit f652133

Browse files
committed
Cleaned U kernel files
1 parent 90b952a commit f652133

File tree

3 files changed

+63
-285
lines changed

3 files changed

+63
-285
lines changed

include/kernel/u_kernel.h

Lines changed: 29 additions & 245 deletions
Original file line numberDiff line numberDiff line change
@@ -81,18 +81,18 @@ void UDotUnit2LstmPe(const int vect_length, const int num_tiles,
8181
#pragma HLS FUNCTION_INSTANTIATE variable=num_iter
8282
#pragma HLS FUNCTION_INSTANTIATE variable=num_tiles
8383
#pragma HLS FUNCTION_INSTANTIATE variable=num_timesteps
84-
// #pragma HLS INTERFACE ap_ctrl_none port=return
8584
assert(vect_length % num_tiles == 0);
86-
8785
const int kNumElemsPerTile = vect_length / num_tiles;
8886
AccumType y1_mac = 0;
8987
AccumType y2_mac = 0;
90-
91-
ReduceProd_PE_IterTimesteps_Loop:
88+
U_PE_Loop:
9289
for (int i = 0; i < num_iter * num_timesteps; ++i) {
93-
ReduceProd_PE_Loop:
9490
for (int j = 0; j < kNumElemsPerTile; ++j) {
91+
#ifdef __VITIS_HLS__
9592
#pragma HLS PIPELINE II=1 style=frp
93+
#else
94+
#pragma HLS PIPELINE II=1
95+
#endif
9696
if (j == 0) {
9797
y1_mac = 0;
9898
y2_mac = 0;
@@ -114,138 +114,11 @@ void UDotUnit2LstmPe(const int vect_length, const int num_tiles,
114114
}
115115
}
116116

117-
/**
118-
* @brief Accumulate partial results from ReduceProd PEs.
119-
*
120-
* @param acc1_streams The acc 1 streams, each from a PE
121-
* @param acc2_streams The acc 2 streams, each from a PE
122-
* @param y1_stream The single y_1 stream
123-
* @param y2_stream The single y_2 stream
124-
*
125-
* @tparam VectLength The input vector dimension
126-
* @tparam NumTiles The number of used tiles (to determine the
127-
* number of PEs)
128-
* @tparam NumZeroTiles The number of pruned tiles (to determine the
129-
* number of PEs)
130-
* @tparam NumIter The number of refinement steps (to make the
131-
* pipeline longer)
132-
* @tparam NumTimesteps The number of LSTM timesteps (to make the
133-
* pipeline longer)
134-
* @tparam AdderTreeDesign Enable or disable AdderTree design. Default is
135-
* active, i.e. true.
136-
*/
137-
template <int VectLength, int NumTiles, int NumZeroTiles, int NumIter,
138-
int NumTimesteps, bool AdderTreeDesign = true>
139-
void UDotUnit2LstmAccumulator(svd::AccumStream (&acc1_streams)[NumTiles-NumZeroTiles],
140-
svd::AccumStream (&acc2_streams)[NumTiles-NumZeroTiles],
141-
svd::ActivationStream &y1_stream,
142-
svd::ActivationStream &y2_stream) {
143-
#pragma HLS INLINE off
144-
// #pragma HLS INTERFACE ap_ctrl_none port=return
145-
const int kNumPEs = NumTiles - NumZeroTiles;
146-
147-
if (AdderTreeDesign) {
148-
// Determine the number of ranks for the adder tree and declare array
149-
// - The adder_tree is larger than required as each rank only needs to be half the size of the previous rank
150-
const unsigned kNumPEsLog2 = hlsutils::log2<kNumPEs>::value;
151-
const unsigned kNumPEsSub1Log2 = hlsutils::log2<kNumPEs - 1>::value;
152-
const unsigned kNumRanks = kNumPEsLog2 != kNumPEsSub1Log2 ? kNumPEsLog2 : kNumPEsLog2 + 1;
153-
svd::AccumD adder_tree1[kNumRanks][kNumPEs];
154-
svd::AccumD adder_tree2[kNumRanks][kNumPEs];
155-
156-
unsigned rank_size = kNumPEs;
157-
158-
for (int i = 0; i < NumIter * NumTimesteps; ++i) {
159-
#pragma HLS PIPELINE II=1 style=frp
160-
add_level_loop:
161-
for(int adder_tree_rank = kNumRanks - 1; adder_tree_rank >= 0; --adder_tree_rank) {
162-
const bool kLoopInit = adder_tree_rank == kNumRanks - 1 ? true : false;
163-
const bool kLoopEpilog = adder_tree_rank == 0 ? true : false;
164-
165-
if (kLoopInit) {
166-
rank_size = kNumPEs;
167-
}
168-
169-
const bool prev_rank_is_odd = rank_size % 2 == 0 ? false : true;
170-
rank_size = (rank_size + 1) / 2;
171-
// std::cout << "[" << adder_tree_rank << "] rank_size: " << rank_size << "\n";
172-
173-
add_col_loop:
174-
for(int jj = 0; jj < (kNumPEs + 1) / 2; ++jj) {
175-
if (jj < rank_size) {
176-
if (prev_rank_is_odd && jj == rank_size - 1) {
177-
// Bypass, no adder required.
178-
if (kLoopInit) {
179-
adder_tree1[adder_tree_rank][jj] = acc1_streams[jj * 2].read();
180-
adder_tree2[adder_tree_rank][jj] = acc2_streams[jj * 2].read();
181-
// std::cout << "\t\tstream[" << adder_tree_rank << "][" << jj * 2 << "] = [" << jj << "]\n";
182-
} else {
183-
adder_tree1[adder_tree_rank][jj] = adder_tree1[adder_tree_rank + 1][jj * 2];
184-
adder_tree2[adder_tree_rank][jj] = adder_tree2[adder_tree_rank + 1][jj * 2];
185-
// std::cout << "\t\tbuffer[" << adder_tree_rank << "][" << jj * 2 << "] = [" << adder_tree_rank + 1 << "][" << jj << "]\n";
186-
}
187-
} else {
188-
if (kLoopInit) {
189-
auto y1_acc = acc1_streams[jj * 2].read() + acc1_streams[jj * 2 + 1].read();
190-
auto y2_acc = acc2_streams[jj * 2].read() + acc2_streams[jj * 2 + 1].read();
191-
#pragma HLS RESOURCE variable=y1_acc core=AddSub_DSP
192-
#pragma HLS RESOURCE variable=y2_acc core=AddSub_DSP
193-
adder_tree1[adder_tree_rank][jj] = y1_acc;
194-
adder_tree2[adder_tree_rank][jj] = y2_acc;
195-
// std::cout << "\tstreams[" << adder_tree_rank << "][" << jj << "] = [" << jj * 2 << "] + [" << jj * 2 + 1 << "]\n";
196-
} else{
197-
auto y1_acc = adder_tree1[adder_tree_rank + 1][jj * 2] + adder_tree1[adder_tree_rank + 1][jj * 2 + 1];
198-
auto y2_acc = adder_tree2[adder_tree_rank + 1][jj * 2] + adder_tree2[adder_tree_rank + 1][jj * 2 + 1];
199-
#pragma HLS RESOURCE variable=y1_acc core=AddSub_DSP
200-
#pragma HLS RESOURCE variable=y2_acc core=AddSub_DSP
201-
adder_tree1[adder_tree_rank][jj] = y1_acc;
202-
adder_tree2[adder_tree_rank][jj] = y2_acc;
203-
// std::cout << "\tbuffer[" << adder_tree_rank << "][" << jj << "] = [" << adder_tree_rank + 1 << "][" << jj * 2 << "] + [" << adder_tree_rank + 1 << "][" << jj * 2 + 1 << "]\n";
204-
}
205-
}
206-
}
207-
}
208-
if (kLoopEpilog) {
209-
y1_stream.write(adder_tree1[0][0]);
210-
y2_stream.write(adder_tree2[0][0]);
211-
// std::cout << "\n";
212-
}
213-
}
214-
}
215-
} else {
216-
svd::AccumD y1_acc = 0;
217-
svd::AccumD y2_acc = 0;
218-
for (int i = 0; i < NumIter * NumTimesteps; ++i) {
219-
AdderTree_PE_Loop:
220-
for (int j = 0; j < kNumPEs; ++j) {
221-
#pragma HLS PIPELINE II=1 style=frp
222-
if (j == 0) {
223-
y1_acc = 0;
224-
y2_acc = 0;
225-
}
226-
auto acc1 = y1_acc + acc1_streams[j].read();
227-
auto acc2 = y2_acc + acc2_streams[j].read();
228-
#pragma HLS RESOURCE variable=acc1 core=AddSub_DSP
229-
#pragma HLS RESOURCE variable=acc2 core=AddSub_DSP
230-
y1_acc = acc1;
231-
y2_acc = acc2;
232-
if (j == kNumPEs - 1) {
233-
y1_stream.write(y1_acc);
234-
y2_stream.write(y2_acc);
235-
}
236-
}
237-
}
238-
}
239-
}
240-
241117
/**
242118
* @brief Reduce Product Unit of an LSTM gate. It computes the parallel dot
243119
* product between input x and a U vector. It also performs the
244120
* refinement steps and feeds the Element Wise Product Unit.
245121
*
246-
* @todo (22/03/2019 - algorithm): The INTERNAL_BUFFER design needs to be
247-
* updated with the NumIter and NumTimesteps iterations.
248-
*
249122
* @param[in] x1_streams The input x of LSTM n.1
250123
* @param[in] x2_streams The input x of LSTM n.2
251124
* @param[in] gate_u_streams The common U weight vector component
@@ -255,136 +128,46 @@ void UDotUnit2LstmAccumulator(svd::AccumStream (&acc1_streams)[NumTiles-NumZeroT
255128
* @tparam VectLength The length of the weight vector
256129
* @tparam NumTiles The number of tiles the vector is divided into
257130
* @tparam NumZeroTiles The number of zeroed, i.e. pruned, tiles
131+
* @tparam NumIter The number of refinement steps
132+
* @tparam NumTimesteps The number of LSTM timesteps
258133
*/
259134
template <int VectLength, int NumTiles, int NumZeroTiles, int NumIter,
260135
int NumTimesteps>
261136
void UDotUnit2Lstm(svd::ActivationStream (&x1_streams)[NumTiles-NumZeroTiles],
262-
svd::ActivationStream (&x2_streams)[NumTiles-NumZeroTiles],
263-
WeightStream (&gate_u_streams)[NumTiles-NumZeroTiles],
264-
svd::ActivationStream &y1,
265-
svd::ActivationStream &y2) {
137+
svd::ActivationStream (&x2_streams)[NumTiles-NumZeroTiles],
138+
WeightStream (&gate_u_streams)[NumTiles-NumZeroTiles],
139+
svd::ActivationStream &y1,
140+
svd::ActivationStream &y2) {
266141
assert(VectLength % NumTiles == 0);
267142
assert(NumZeroTiles < NumTiles);
268-
assert(NumTiles >= 8);
269143
assert(NumTiles % 2 == 0);
270-
// =============================================================================
271-
#define REDUCE_PROD_2LSTM_DATAFLOW_DESIGN
272-
// #define REDUCE_PROD_2LSTM_MERGE_DSP // the accuracy is killed, possible error.
273-
// =============================================================================
274-
#if !defined(REDUCE_PROD_2LSTM_DATAFLOW_DESIGN) && \
275-
defined(REDUCE_PROD_2LSTM_MERGE_DSP) && FIX_WIDTH == 8
276144
#pragma HLS DATAFLOW
277-
// ===========================================================================
278-
// Implements shared DSP and LUT function for computing 2 mac ops in 1 DSP.
279-
// ===========================================================================
280-
const int kNumNonZeroTiles = NumTiles - NumZeroTiles;
281-
const int kNumPEs = kNumNonZeroTiles;
282-
const int kNumElemsPerTile = VectLength / NumTiles;
283-
const int kStreamDepth = NumIter * kNumElemsPerTile;
284-
svd::AccumD y1_mul[kNumPEs];
285-
svd::AccumD y2_mul[kNumPEs];
286-
#pragma HLS ARRAY_PARTITION variable=y1_mul complete dim=1
287-
#pragma HLS ARRAY_PARTITION variable=y2_mul complete dim=1
288-
#pragma HLS STREAM variable=y1_mul depth=kStreamDepth
289-
#pragma HLS STREAM variable=y2_mul depth=kStreamDepth
290-
291-
svd::AccumD y1_acc = 0;
292-
svd::AccumD y2_acc = 0;
293-
#pragma HLS RESOURCE variable=y1_acc core=AddSub_DSP
294-
#pragma HLS RESOURCE variable=y2_acc core=AddSub_DSP
295-
296-
for (int n = 0; n < NumIter * NumTimesteps; ++n) {
297-
ReduceProd_PE_Loop:
298-
for (int i = 0; i < kNumPEs; ++i) {
299-
#if FIX_WIDTH == 8
300-
#pragma HLS ALLOCATION instances=dot_prod_dsp_lut limit=kNumPEs function
301-
#else
302-
#pragma HLS ALLOCATION instances=dot_prod_dsp_lut_generic limit=kNumPEs function
303-
#endif
304-
#pragma HLS UNROLL
305-
y1_mul[i] = 0;
306-
y2_mul[i] = 0;
307-
ReduceProd_Tile_Loop:
308-
for (int j = 0; j < kNumElemsPerTile / 2; ++j) {
309-
#pragma HLS PIPELINE II=1 style=frp
310-
// auto p0_tmp = y_dsp * w_dsp + y_lut * w_lut;
311-
// auto p1_tmp = x_dsp * w_dsp + x_lut * w_lut;
312-
// p0 += p0_tmp;
313-
// p1 += p1_tmp;
314-
#if FIX_WIDTH == 8
315-
svd::AccumD x_dsp = 0; // x1_streams[i].read();
316-
svd::AccumD y_dsp = 0; // x2_streams[i].read();
317-
svd::AccumD w_dsp = 0; // gate_u_streams[i].read();
318-
svd::AccumD x_lut = 0; // x1_streams[i].read();
319-
svd::AccumD y_lut = 0; // x2_streams[i].read();
320-
svd::AccumD w_lut = 0; // gate_u_streams[i].read();
321-
x_dsp.range() = x1_streams[i].read().range();
322-
y_dsp.range() = x2_streams[i].read().range();
323-
w_dsp.range() = gate_u_streams[i].read().range();
324-
x_lut.range() = x1_streams[i].read().range();
325-
y_lut.range() = x2_streams[i].read().range();
326-
w_lut.range() = gate_u_streams[i].read().range();
327-
dot_prod_dsp_lut(x_dsp, y_dsp, w_dsp, x_lut, y_lut, w_lut,
328-
y2_mul[i], y1_mul[i]);
329-
#else
330-
svd::AccumD x_dsp = x1_streams[i].read();
331-
svd::AccumD y_dsp = x2_streams[i].read();
332-
svd::AccumD w_dsp = gate_u_streams[i].read();
333-
svd::AccumD x_lut = x1_streams[i].read();
334-
svd::AccumD y_lut = x2_streams[i].read();
335-
svd::AccumD w_lut = gate_u_streams[i].read();
336-
dot_prod_dsp_lut_generic(x_dsp, y_dsp, w_dsp, x_lut, y_lut, w_lut,
337-
y2_mul[i], y1_mul[i]);
338-
#endif
339-
}
340-
}
341-
}
342-
343-
ReduceProd_Accumulation_Loop:
344-
for (int i = 0; i < NumIter * NumTimesteps; ++i) {
345-
for (int j = 0; j < kNumPEs; ++j) {
346-
#pragma HLS PIPELINE II=1 style=frp
347-
y1_acc += y1_mul[j];
348-
y2_acc += y2_mul[j];
349-
}
350-
y1.write(y1_acc);
351-
y2.write(y2_acc);
352-
}
353-
#else
354-
// =============================================================================
355-
// Implements #mac_PEs = NumTiles - NumZeroTiles & #Adder_Tree = 1
356-
// =============================================================================
357-
// #pragma HLS INTERFACE ap_ctrl_none port=return
358-
#pragma HLS DATAFLOW
359-
// #pragma HLS INLINE
360-
361-
const unsigned kNumNonZeroTiles = NumTiles - NumZeroTiles;
362-
const unsigned kNumPEs = kNumNonZeroTiles;
363-
// NOTE: both PE and adder-tree have II=1, but the adder-tree reads in round
364-
// robin fashion from the PE queues. Hence, before the adder-tree reads again
365-
// from the same PE queue, kNumPEs cycles pass. This constrains the depth of
366-
// the queues to kNumPEs. (THIS WON'T WORK, TOO LOW CONSUMER RATE)
367-
// FIXED: Using an adder tree allows using a stream of depth 1.
368-
const unsigned kStreamDepth = 1; // VectLength / NumTiles;
369-
370-
hls::stream<svd::AccumD> acc1_streams[kNumNonZeroTiles];
371-
hls::stream<svd::AccumD> acc2_streams[kNumNonZeroTiles];
145+
#pragma HLS INLINE
146+
const unsigned kNumPEs = NumTiles - NumZeroTiles;
147+
const unsigned kStreamDepth = 2;
148+
hls::stream<svd::AccumD> acc1_streams[kNumPEs];
149+
hls::stream<svd::AccumD> acc2_streams[kNumPEs];
372150
#pragma HLS ARRAY_PARTITION variable=acc1_streams complete dim=1
373151
#pragma HLS ARRAY_PARTITION variable=acc1_streams complete dim=1
374152
#pragma HLS STREAM variable=acc1_streams depth=kStreamDepth
375153
#pragma HLS STREAM variable=acc2_streams depth=kStreamDepth
376-
377154
PE_Loop:
378155
for (int pe = 0; pe < kNumPEs; ++pe) {
379156
#pragma HLS UNROLL
380157
UDotUnit2LstmPe<svd::ActivationD, svd::WeightD, svd::AccumD>(VectLength,
381-
NumTiles, NumIter, NumTimesteps,
382-
x1_streams[pe], x2_streams[pe], gate_u_streams[pe], acc1_streams[pe],
383-
acc2_streams[pe]);
158+
NumTiles, NumIter, NumTimesteps, x1_streams[pe], x2_streams[pe],
159+
gate_u_streams[pe], acc1_streams[pe], acc2_streams[pe]);
160+
}
161+
U_AdderTree_Loop:
162+
for (int i = 0; i < NumIter * NumTimesteps; ++i) {
163+
#ifdef __VITIS_HLS__
164+
#pragma HLS PIPELINE II=1 style=frp
165+
#else
166+
#pragma HLS PIPELINE II=1
167+
#endif
168+
y1.write(hlsutils::adder_tree<svd::AccumD, kNumPEs>(acc1_streams));
169+
y2.write(hlsutils::adder_tree<svd::AccumD, kNumPEs>(acc2_streams));
384170
}
385-
UDotUnit2LstmAccumulator<VectLength, NumTiles, NumZeroTiles, NumIter, NumTimesteps>(
386-
acc1_streams, acc2_streams, y1, y2);
387-
#endif // end REDUCE_PROD_2LSTM_DATAFLOW_DESIGN
388171
}
389172

390173
#ifdef __VITIS_HLS__
@@ -416,6 +199,7 @@ void KernelU(const int num_active_inputs,
416199
const int input_size,
417200
const int num_refinements[params::N],
418201
const bool pad_output,
202+
// hls::stream<typename params::IndexU_Type>& z_idx_port,
419203
hls::stream<typename params::VectTuAxiPacketType>& x_port,
420204
hls::stream<typename params::VectTuAxiPacketType>& u_port,
421205
hls::stream<typename WrapperAxisG::PacketType>& xu_port) {

include/svd_params.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ struct SvdParameters {
5656
public:
5757
static const int TuBits = TuBits_tmp > 0 ? TuBits_tmp : 1;
5858
static const int TvBits = TvBits_tmp > 0 ? TvBits_tmp : 1;
59+
typedef ap_uint<MaxNumTu> IndexU_Type;
60+
typedef ap_uint<MaxNumTv> IndexV_Type;
5961
typedef ap_uint<MaxNumTu> UnzD;
6062
typedef ap_uint<MaxNumTv> VnzD;
6163
typedef ap_uint<TuBits> UnzIdxD;

0 commit comments

Comments
 (0)