|
13 | 13 | * License for the specific language governing permissions and limitations |
14 | 14 | * under the License. |
15 | 15 | */ |
16 | | -// Work load of each CU |
17 | | -#define BUFFER_SIZE 1024 |
18 | | -#define DATA_SIZE 4096 |
19 | | - |
20 | | -// TRIPCOUNT indentifier |
21 | | -const unsigned int c_len = DATA_SIZE / BUFFER_SIZE; |
22 | | -const unsigned int c_size = BUFFER_SIZE; |
23 | 16 |
|
24 | | -/* |
25 | | - Vector Addition Kernel Implementation |
26 | | - Arguments: |
27 | | - in1 (input) --> Input Vector1 |
28 | | - in2 (input) --> Input Vector2 |
29 | | - out_r (output) --> Output Vector |
30 | | - size (input) --> Size of Vector in Integer |
31 | | -*/ |
| 17 | +/******************************************************************************* |
| 18 | +Description: |
| 19 | + This example uses the load/compute/store coding style which is generally |
| 20 | + the most efficient for implementing kernels using HLS. The load and store |
| 21 | + functions are responsible for moving data in and out of the kernel as |
| 22 | + efficiently as possible. The core functionality is decomposed across one |
| 23 | + or more compute functions. Whenever possible, the compute function should |
| 24 | + pass data through HLS streams and should contain a single set of nested loops. |
| 25 | + HLS stream objects are used to pass data between producer and consumer |
| 26 | + functions. Stream read and write operations have a blocking behavior which |
| 27 | + allows consumers and producers to synchronize with each other automatically. |
| 28 | + The dataflow pragma instructs the compiler to enable task-level pipelining. |
| 29 | + This is required for the load/compute/store functions to execute in a parallel |
| 30 | + and pipelined manner. Here the kernel loads, computes and stores NUM_WORDS integer values per |
| 31 | + clock cycle and is implemented as below: |
| 32 | + _____________ |
| 33 | + | |<----- Input Vector 1 from Global Memory |
| 34 | + | load_input | __ |
| 35 | + |_____________|----->| | |
| 36 | + _____________ | | in1_stream |
| 37 | +Input Vector 2 from Global Memory --->| | |__| |
| 38 | + __ | load_input | | |
| 39 | + | |<---|_____________| | |
| 40 | + in2_stream | | _____________ | |
| 41 | + |__|--->| |<-------- |
| 42 | + | compute_add | __ |
| 43 | + |_____________|---->| | |
| 44 | + ______________ | | out_stream |
| 45 | + | |<---|__| |
| 46 | + | store_result | |
| 47 | + |______________|-----> Output result to Global Memory |
32 | 48 |
|
33 | | -extern "C" { |
34 | | -void vadd(const unsigned int* in1, // Read-Only Vector 1 |
35 | | - const unsigned int* in2, // Read-Only Vector 2 |
36 | | - unsigned int* out_r, // Output Result |
37 | | - int size // Size in integer |
38 | | - ) { |
39 | | - unsigned int v1_buffer[BUFFER_SIZE]; // Local memory to store vector1 |
40 | | - unsigned int v2_buffer[BUFFER_SIZE]; // Local memory to store vector2 |
41 | | - unsigned int vout_buffer[BUFFER_SIZE]; // Local Memory to store result |
| 49 | +*******************************************************************************/ |
42 | 50 |
|
43 | | - // Per iteration of this loop perform BUFFER_SIZE vector addition |
44 | | - for (int i = 0; i < size; i += BUFFER_SIZE) { |
45 | | -#pragma HLS LOOP_TRIPCOUNT min = c_len max = c_len |
46 | | - int chunk_size = BUFFER_SIZE; |
47 | | - // boundary checks |
48 | | - if ((i + BUFFER_SIZE) > size) chunk_size = size - i; |
| 51 | +#include <stdint.h> |
| 52 | +#include <hls_stream.h> |
49 | 53 |
|
50 | | - // Transferring data in bursts hides the memory access latency as well as |
51 | | - // improves bandwidth utilization and efficiency of the memory controller. |
52 | | - // It is recommended to infer burst transfers from successive requests of data |
53 | | - // from consecutive address locations. |
54 | | - // A local memory vl_local is used for buffering the data from a single burst. |
55 | | - // The entire input vector is read in multiple bursts. |
56 | | - // The choice of LOCAL_MEM_SIZE depends on the specific applications and |
57 | | - // available on-chip memory on target FPGA. |
58 | | - // burst read of v1 and v2 vector from global memory |
| 54 | +#define DATA_SIZE 4096 |
59 | 55 |
|
60 | | - // Auto-pipeline is going to apply pipeline to these loops |
61 | | - read1: |
62 | | - for (int j = 0; j < chunk_size; j++) { |
63 | | -#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size |
64 | | - v1_buffer[j] = in1[i + j]; |
65 | | - } |
| 56 | +// TRIPCOUNT identifier |
| 57 | +const int c_size = DATA_SIZE; |
66 | 58 |
|
67 | | - read2: |
68 | | - for (int j = 0; j < chunk_size; j++) { |
| 59 | +static void read_input(unsigned int* in, hls::stream<unsigned int>& inStream, int size) { |
| 60 | +// Auto-pipeline is going to apply pipeline to this loop |
| 61 | +mem_rd: |
| 62 | + for (int i = 0; i < size; i++) { |
69 | 63 | #pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size |
70 | | - v2_buffer[j] = in2[i + j]; |
71 | | - } |
| 64 | + inStream << in[i]; |
| 65 | + } |
| 66 | +} |
72 | 67 |
|
73 | | - // PIPELINE pragma reduces the initiation interval for loop by allowing the |
74 | | - // concurrent executions of operations |
75 | | - vadd: |
76 | | - for (int j = 0; j < chunk_size; j++) { |
| 68 | +static void compute_add(hls::stream<unsigned int>& inStream1, |
| 69 | + hls::stream<unsigned int>& inStream2, |
| 70 | + hls::stream<unsigned int>& outStream, |
| 71 | + int size) { |
| 72 | +// Auto-pipeline is going to apply pipeline to this loop |
| 73 | +execute: |
| 74 | + for (int i = 0; i < size; i++) { |
77 | 75 | #pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size |
78 | | - // perform vector addition |
79 | | - vout_buffer[j] = v1_buffer[j] + v2_buffer[j]; |
80 | | - } |
| 76 | + outStream << (inStream1.read() + inStream2.read()); |
| 77 | + } |
| 78 | +} |
81 | 79 |
|
82 | | - // burst write the result |
83 | | - write: |
84 | | - for (int j = 0; j < chunk_size; j++) { |
| 80 | +static void write_result(unsigned int* out, hls::stream<unsigned int>& outStream, int size) { |
| 81 | +// Auto-pipeline is going to apply pipeline to this loop |
| 82 | +mem_wr: |
| 83 | + for (int i = 0; i < size; i++) { |
85 | 84 | #pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size |
86 | | - out_r[i + j] = vout_buffer[j]; |
87 | | - } |
| 85 | + out[i] = outStream.read(); |
88 | 86 | } |
89 | 87 | } |
| 88 | + |
| 89 | +extern "C" { |
| 90 | +/* |
| 91 | + Vector Addition Kernel Implementation using dataflow |
| 92 | + Arguments: |
| 93 | + in1 (input) --> Input Vector 1 |
| 94 | + in2 (input) --> Input Vector 2 |
| 95 | + out (output) --> Output Vector |
| 96 | + size (input) --> Size of Vector in Integer |
| 97 | + */ |
| 98 | +void vadd(unsigned int* in1, unsigned int* in2, unsigned int* out, int size) { |
| 99 | + static hls::stream<unsigned int> inStream1("input_stream_1"); |
| 100 | + static hls::stream<unsigned int> inStream2("input_stream_2"); |
| 101 | + static hls::stream<unsigned int> outStream("output_stream"); |
| 102 | + |
| 103 | +#pragma HLS INTERFACE m_axi port = in1 bundle = gmem0 |
| 104 | +#pragma HLS INTERFACE m_axi port = in2 bundle = gmem1 |
| 105 | +#pragma HLS INTERFACE m_axi port = out bundle = gmem0 |
| 106 | + |
| 107 | +#pragma HLS dataflow |
| 108 | + // The dataflow pragma instructs the compiler to run the following three functions in parallel |
| 109 | + read_input(in1, inStream1, size); |
| 110 | + read_input(in2, inStream2, size); |
| 111 | + compute_add(inStream1, inStream2, outStream, size); |
| 112 | + write_result(out, outStream, size); |
| 113 | +} |
90 | 114 | } |
0 commit comments