|
13 | 13 | * License for the specific language governing permissions and limitations |
14 | 14 | * under the License. |
15 | 15 | */ |
16 | | -// Work load of each CU |
17 | | -#define BUFFER_SIZE 1024 |
18 | | -#define DATA_SIZE 4096 |
19 | | - |
20 | | -// TRIPCOUNT indentifier |
21 | | -const unsigned int c_len = DATA_SIZE / BUFFER_SIZE; |
22 | | -const unsigned int c_size = BUFFER_SIZE; |
23 | 16 |
|
24 | | -/* |
25 | | - Vector Addition Kernel Implementation |
26 | | - Arguments: |
27 | | - in1 (input) --> Input Vector1 |
28 | | - in2 (input) --> Input Vector2 |
29 | | - out_r (output) --> Output Vector |
30 | | - size (input) --> Size of Vector in Integer |
31 | | -*/ |
| 17 | +/******************************************************************************* |
| 18 | +Description: |
| 19 | + This example uses the load/compute/store coding style which is generally |
| 20 | + the most efficient for implementing kernels using HLS. The load and store |
| 21 | + functions are responsible for moving data in and out of the kernel as |
| 22 | + efficiently as possible. The core functionality is decomposed across one |
| 23 | + or more compute functions. Whenever possible, the compute function should |
| 24 | + pass data through HLS streams and should contain a single set of nested loops. |
| 25 | + HLS stream objects are used to pass data between producer and consumer |
| 26 | + functions. Stream read and write operations have a blocking behavior which |
| 27 | + allows consumers and producers to synchronize with each other automatically. |
| 28 | + The dataflow pragma instructs the compiler to enable task-level pipelining. |
| 29 | + This is required for the load/compute/store functions to execute in a parallel |
| 30 | + and pipelined manner. Here the kernel loads, computes and stores NUM_WORDS integer values per |
| 31 | + clock cycle and is implemented as below: |
| 32 | + _____________ |
| 33 | + | |<----- Input Vector 1 from Global Memory |
| 34 | + | load_input | __ |
| 35 | + |_____________|----->| | |
| 36 | + _____________ | | in1_stream |
| 37 | +Input Vector 2 from Global Memory --->| | |__| |
| 38 | + __ | load_input | | |
| 39 | + | |<---|_____________| | |
| 40 | + in2_stream | | _____________ | |
| 41 | + |__|--->| |<-------- |
| 42 | + | compute_add | __ |
| 43 | + |_____________|---->| | |
| 44 | + ______________ | | out_stream |
| 45 | + | |<---|__| |
| 46 | + | store_result | |
| 47 | + |______________|-----> Output result to Global Memory |
32 | 48 |
|
33 | | -extern "C" { |
34 | | -void vadd(const unsigned int* in1, // Read-Only Vector 1 |
35 | | - const unsigned int* in2, // Read-Only Vector 2 |
36 | | - unsigned int* out_r, // Output Result |
37 | | - int size // Size in integer |
38 | | - ) { |
39 | | - unsigned int v1_buffer[BUFFER_SIZE]; // Local memory to store vector1 |
40 | | - unsigned int v2_buffer[BUFFER_SIZE]; // Local memory to store vector2 |
41 | | - unsigned int vout_buffer[BUFFER_SIZE]; // Local Memory to store result |
| 49 | +*******************************************************************************/ |
42 | 50 |
|
43 | | - // Per iteration of this loop perform BUFFER_SIZE vector addition |
44 | | - for (int i = 0; i < size; i += BUFFER_SIZE) { |
45 | | -#pragma HLS LOOP_TRIPCOUNT min = c_len max = c_len |
46 | | - int chunk_size = BUFFER_SIZE; |
47 | | - // boundary checks |
48 | | - if ((i + BUFFER_SIZE) > size) chunk_size = size - i; |
| 51 | +#include <stdint.h> |
| 52 | +#include <hls_stream.h> |
49 | 53 |
|
50 | | - // Transferring data in bursts hides the memory access latency as well as |
51 | | - // improves bandwidth utilization and efficiency of the memory controller. |
52 | | - // It is recommended to infer burst transfers from successive requests of data |
53 | | - // from consecutive address locations. |
54 | | - // A local memory vl_local is used for buffering the data from a single burst. |
55 | | - // The entire input vector is read in multiple bursts. |
56 | | - // The choice of LOCAL_MEM_SIZE depends on the specific applications and |
57 | | - // available on-chip memory on target FPGA. |
58 | | - // burst read of v1 and v2 vector from global memory |
| 54 | +#define DATA_SIZE 4096 |
59 | 55 |
|
60 | | - // Auto-pipeline is going to apply pipeline to these loops |
61 | | - read1: |
62 | | - for (int j = 0; j < chunk_size; j++) { |
63 | | -#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size |
64 | | - v1_buffer[j] = in1[i + j]; |
65 | | - } |
| 56 | +// TRIPCOUNT identifier |
| 57 | +const int c_size = DATA_SIZE; |
66 | 58 |
|
67 | | - read2: |
68 | | - for (int j = 0; j < chunk_size; j++) { |
| 59 | +static void read_input(unsigned int* in, hls::stream<unsigned int>& inStream, int size) { |
| 60 | +// Auto-pipeline is going to apply pipeline to this loop |
| 61 | +mem_rd: |
| 62 | + for (int i = 0; i < size; i++) { |
69 | 63 | #pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size |
70 | | - v2_buffer[j] = in2[i + j]; |
71 | | - } |
| 64 | + inStream << in[i]; |
| 65 | + } |
| 66 | +} |
72 | 67 |
|
73 | | - // PIPELINE pragma reduces the initiation interval for loop by allowing the |
74 | | - // concurrent executions of operations |
75 | | - vadd: |
76 | | - for (int j = 0; j < chunk_size; j++) { |
| 68 | +static void compute_add(hls::stream<unsigned int>& inStream1, |
| 69 | + hls::stream<unsigned int>& inStream2, |
| 70 | + hls::stream<unsigned int>& outStream, |
| 71 | + int size) { |
| 72 | +// Auto-pipeline is going to apply pipeline to this loop |
| 73 | +execute: |
| 74 | + for (int i = 0; i < size; i++) { |
77 | 75 | #pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size |
78 | | - // perform vector addition |
79 | | - vout_buffer[j] = v1_buffer[j] + v2_buffer[j]; |
80 | | - } |
| 76 | + outStream << (inStream1.read() + inStream2.read()); |
| 77 | + } |
| 78 | +} |
81 | 79 |
|
82 | | - // burst write the result |
83 | | - write: |
84 | | - for (int j = 0; j < chunk_size; j++) { |
| 80 | +static void write_result(unsigned int* out, hls::stream<unsigned int>& outStream, int size) { |
| 81 | +// Auto-pipeline is going to apply pipeline to this loop |
| 82 | +mem_wr: |
| 83 | + for (int i = 0; i < size; i++) { |
85 | 84 | #pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size |
86 | | - out_r[i + j] = vout_buffer[j]; |
87 | | - } |
| 85 | + out[i] = outStream.read(); |
88 | 86 | } |
89 | 87 | } |
| 88 | + |
| 89 | +extern "C" { |
| 90 | +/* |
| 91 | + Vector Addition Kernel Implementation using dataflow |
| 92 | + Arguments: |
| 93 | + in1 (input) --> Input Vector 1 |
| 94 | + in2 (input) --> Input Vector 2 |
| 95 | + out (output) --> Output Vector |
| 96 | + size (input) --> Size of Vector in Integer |
| 97 | + */ |
| 98 | +void vadd(unsigned int* in1, unsigned int* in2, unsigned int* out, int size) { |
| 99 | + static hls::stream<unsigned int> inStream1("input_stream_1"); |
| 100 | + static hls::stream<unsigned int> inStream2("input_stream_2"); |
| 101 | + static hls::stream<unsigned int> outStream("output_stream"); |
| 102 | + |
| 103 | +#pragma HLS INTERFACE m_axi port = in1 bundle = gmem0 |
| 104 | +#pragma HLS INTERFACE m_axi port = in2 bundle = gmem1 |
| 105 | +#pragma HLS INTERFACE m_axi port = out bundle = gmem0 |
| 106 | + |
| 107 | +#pragma HLS dataflow |
| 108 | + // The dataflow pragma instructs the compiler to run the following three functions in parallel |
| 109 | + read_input(in1, inStream1, size); |
| 110 | + read_input(in2, inStream2, size); |
| 111 | + compute_add(inStream1, inStream2, outStream, size); |
| 112 | + write_result(out, outStream, size); |
| 113 | +} |
90 | 114 | } |
0 commit comments