FPGA: Tidy up Component Interfaces Comparison code sample (#2341)

whitepau · web-flow · commit 9efc8007cdfb · 2024-05-22T10:46:09.000+02:00
- improve comparability across code versions
- remove unnecessary ready signal from csr-pipes version
- re-run clang-format
diff --git a/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/csr-pipes/src/vector_add.cpp b/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/csr-pipes/src/vector_add.cpp
@@ -1,40 +1,38 @@
 #include <iostream>
 
 // oneAPI headers
-#include <sycl/sycl.hpp>
 #include <sycl/ext/intel/fpga_extensions.hpp>
+#include <sycl/sycl.hpp>
+
 #include "exception_handler.hpp"
 
 // Forward declare the kernel name in the global scope. This is an FPGA best
 // practice that reduces name mangling in the optimization reports.
-class SimpleVAddPipes;
+class IDSimpleVAdd;
 
 // Forward declare pipe names to reduce name mangling
 class IDPipeA;
 class IDPipeB;
 class IDPipeC;
 
-constexpr int kVectorSize = 256;
-
 using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(
     sycl::ext::intel::experimental::ready_latency<0>));
 
 using InputPipeA =
-    sycl::ext::intel::experimental::pipe<IDPipeA, int, 0,
-                                         PipeProps>;
+    sycl::ext::intel::experimental::pipe<IDPipeA, int, 0, PipeProps>;
 using InputPipeB =
-    sycl::ext::intel::experimental::pipe<IDPipeB, int, 0,
-                                         PipeProps>;
+    sycl::ext::intel::experimental::pipe<IDPipeB, int, 0, PipeProps>;
 
-using CSRPipeProps = decltype(sycl::ext::oneapi::experimental::properties(
-    sycl::ext::intel::experimental::protocol_avalon_mm_uses_ready));
+using CsrOutProperties = decltype(sycl::ext::oneapi::experimental::properties(
+    sycl::ext::intel::experimental::protocol<
+        // Host doesn't care about possibly missing an update, so no need for
+        // protocol_name::avalon_mm_uses_ready
+        sycl::ext::intel::experimental::protocol_name::avalon_mm>));
 
-// this csr pipe will only be read from and written to once
 using OutputPipeC =
-    sycl::ext::intel::experimental::pipe<IDPipeC, int, 0,
-                                         CSRPipeProps>;
+    sycl::ext::intel::experimental::pipe<IDPipeC, int, 0, CsrOutProperties>;
 
-struct SimpleVAddKernelPipes {
+struct SimpleVAddKernel {
   int len;
 
   void operator()() const {
@@ -47,11 +45,15 @@ struct SimpleVAddKernelPipes {
       sum_total += sum;
     }
 
-    // Write to OutputPipeC only once per kernel invocation
+    // Write to OutputPipeC only once per kernel invocation. Since we requested
+    // protocol_avalon_mm instead of protocol_avalon_mm_uses_ready, this write
+    // is effectively non-blocking.
     OutputPipeC::write(sum_total);
   }
 };
 
+constexpr int kVectorSize = 256;
+
 int main() {
   try {
     // Use compile-time macros to select either:
@@ -69,32 +71,42 @@ int main() {
     // create the device queue
     sycl::queue q(selector, fpga_tools::exception_handler);
 
-    int count = kVectorSize;  // pass array size by value
+    auto device = q.get_device();
+
+    std::cout << "Running on device: "
+              << device.get_info<sycl::info::device::name>().c_str()
+              << std::endl;
+
+    // Vector size is a constant here, but it could be a run-time variable too.
+    int count = kVectorSize;
 
     int expected_sum = 0;
 
-    // push data into pipes
+    // push data into pipes before invoking kernel
     int *a = new int[count];
     int *b = new int[count];
     for (int i = 0; i < count; i++) {
       a[i] = i;
       b[i] = (count - i);
 
       expected_sum += (a[i] + b[i]);
-      // When writing to a host pipe in non kernel code, 
+      // When writing to a host pipe in non-kernel code,
       // you must pass the sycl::queue as the first argument
       InputPipeA::write(q, a[i]);
       InputPipeB::write(q, b[i]);
     }
 
     std::cout << "Add two vectors of size " << count << std::endl;
 
-    q.single_task<SimpleVAddPipes>(SimpleVAddKernelPipes{count});
+    sycl::event e = q.single_task<IDSimpleVAdd>(SimpleVAddKernel{count});
 
-    // verify that outputs are correct
+    // Verify that outputs are correct, after the kernel has finished running.
+    // Since the write to OutputPipeC is non-blocking, no need to worry about
+    // deadlock.
+    e.wait();
     bool passed = true;
 
-    // only need to read from OutputPipeC once, since the kernel only wrote to it once
+    // Only read from OutputPipeC once, since the kernel only wrote to it once
     int calc = OutputPipeC::read(q);
     if (calc != expected_sum) {
       std::cout << "result " << calc << ", expected (" << expected_sum << ")"
diff --git a/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/mm-host/src/vector_add.cpp b/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/mm-host/src/vector_add.cpp
@@ -12,7 +12,7 @@ constexpr int kBL3 = 3;
 
 // Forward declare the kernel name in the global scope. This is an FPGA best
 // practice that reduces name mangling in the optimization reports.
-class SimpleVAdd;
+class IDSimpleVAdd;
 
 struct SimpleVAddKernel {
   sycl::ext::oneapi::experimental::annotated_arg<
@@ -108,7 +108,7 @@ int main() {
 
     std::cout << "Add two vectors of size " << count << std::endl;
 
-    q.single_task<SimpleVAdd>(SimpleVAddKernel{a, b, c, count}).wait();
+    q.single_task<IDSimpleVAdd>(SimpleVAddKernel{a, b, c, count}).wait();
 
     // verify that VC is correct
     bool passed = true;
diff --git a/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/naive/src/vector_add.cpp b/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/naive/src/vector_add.cpp
@@ -1,9 +1,11 @@
 #include <iostream>
 
 // oneAPI headers
-#include <sycl/sycl.hpp>
 #include <sycl/ext/intel/fpga_extensions.hpp>
+#include <sycl/sycl.hpp>
+
 #include "exception_handler.hpp"
+
 // Forward declare the kernel name in the global scope. This is an FPGA best
 // practice that reduces name mangling in the optimization reports.
 class IDSimpleVAdd;
@@ -27,8 +29,7 @@ struct SimpleVAddKernel {
 constexpr int kVectorSize = 256;
 
 int main() {
-
-  try{
+  try {
     // Use compile-time macros to select either:
     //  - the FPGA emulator device (CPU emulation of the FPGA)
     //  - the FPGA device (a real FPGA)
@@ -44,9 +45,16 @@ int main() {
     // create the device queue
     sycl::queue q(selector, fpga_tools::exception_handler);
 
-    int count = kVectorSize;  // pass array size by value
+    auto device = q.get_device();
+
+    std::cout << "Running on device: "
+              << device.get_info<sycl::info::device::name>().c_str()
+              << std::endl;
+
+    // Vector size is a constant here, but it could be a run-time variable too.
+    int count = kVectorSize;
 
-    // Create USM shared allocations in the specified buffer_location. 
+    // Create USM shared allocations in the specified buffer_location.
     // You can also use host allocations with malloc_host(...) API
     int *a = sycl::malloc_shared<int>(count, q);
     int *b = sycl::malloc_shared<int>(count, q);
@@ -58,9 +66,10 @@ int main() {
 
     std::cout << "Add two vectors of size " << count << std::endl;
 
-    q.single_task<IDSimpleVAdd>(SimpleVAddKernel{a, b, c, count}).wait();
+    sycl::event e = q.single_task<IDSimpleVAdd>(SimpleVAddKernel{a, b, c, count});
 
-    // verify that VC is correct
+    // Verify that outputs are correct, after the kernel has finished running.
+    e.wait();
     bool passed = true;
     for (int i = 0; i < count; i++) {
       int expected = a[i] + b[i];
diff --git a/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/pipes/src/vector_add.cpp b/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/pipes/src/vector_add.cpp
@@ -1,28 +1,31 @@
 #include <iostream>
 
 // oneAPI headers
-#include <sycl/sycl.hpp>
 #include <sycl/ext/intel/fpga_extensions.hpp>
+#include <sycl/sycl.hpp>
+
 #include "exception_handler.hpp"
 
-constexpr int kVectorSize = 256;
 // Forward declare the kernel name in the global scope. This is an FPGA best
 // practice that reduces name mangling in the optimization reports.
-class IDSimpleVAddPipes;
+class IDSimpleVAdd;
+
+// Forward declare pipe names to reduce name mangling
 class IDPipeA;
 class IDPipeB;
 class IDPipeC;
 
 using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(
     sycl::ext::intel::experimental::ready_latency<0>));
+
 using InputPipeA =
     sycl::ext::intel::experimental::pipe<IDPipeA, int, 0, PipeProps>;
 using InputPipeB =
     sycl::ext::intel::experimental::pipe<IDPipeB, int, 0, PipeProps>;
 using OutputPipeC =
     sycl::ext::intel::experimental::pipe<IDPipeC, int, 0, PipeProps>;
 
-struct SimpleVAddKernelPipes {
+struct SimpleVAddKernel {
   int len;
 
   void operator()() const {
@@ -35,6 +38,8 @@ struct SimpleVAddKernelPipes {
   }
 };
 
+constexpr int kVectorSize = 256;
+
 int main() {
   try {
     // Use compile-time macros to select either:
@@ -58,26 +63,28 @@ int main() {
               << device.get_info<sycl::info::device::name>().c_str()
               << std::endl;
 
-    int count = kVectorSize;  // pass array size by value
+    // Vector size is a constant here, but it could be a run-time variable too.
+    int count = kVectorSize;
 
-    // push data into pipes before invoking kernel
+    // Push data into pipes before invoking kernel
     int *a = new int[count];
     int *b = new int[count];
     for (int i = 0; i < count; i++) {
       a[i] = i;
       b[i] = (count - i);
-      // When writing to a host pipe in non kernel code, 
+      // When writing to a host pipe in non-kernel code,
       // you must pass the sycl::queue as the first argument
       InputPipeA::write(q, a[i]);
       InputPipeB::write(q, b[i]);
     }
 
     std::cout << "Add two vectors of size " << count << std::endl;
 
-    q.single_task<IDSimpleVAddPipes>(
-         SimpleVAddKernelPipes{count});
+    q.single_task<IDSimpleVAdd>(SimpleVAddKernel{count});
 
-    // verify that VC is correct
+    // Verify that outputs are correct. Do not wait for the kernel to complete,
+    // because the pipe reads are blocking. Therefore, waiting would cause
+    // deadlock.
     bool passed = true;
     for (int i = 0; i < count; i++) {
       int expected = a[i] + b[i];
diff --git a/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/streaming-invocation/src/vector_add.cpp b/DirectProgramming/C++SYCL_FPGA/Tutorials/Features/hls_flow_interfaces/component_interfaces_comparison/streaming-invocation/src/vector_add.cpp
@@ -1,8 +1,9 @@
 #include <iostream>
 
 // oneAPI headers
-#include <sycl/sycl.hpp>
 #include <sycl/ext/intel/fpga_extensions.hpp>
+#include <sycl/sycl.hpp>
+
 #include "exception_handler.hpp"
 
 // Forward declare the kernel name in the global scope. This is an FPGA best
@@ -65,9 +66,16 @@ int main() {
     // create the device queue
     sycl::queue q(selector, fpga_tools::exception_handler);
 
-    int count = kVectorSize;  // pass array size by value
+    auto device = q.get_device();
 
-    // Create USM shared allocations in the specified buffer_location. 
+    std::cout << "Running on device: "
+              << device.get_info<sycl::info::device::name>().c_str()
+              << std::endl;
+
+    // Vector size is a constant here, but it could be a run-time variable too.
+    int count = kVectorSize;
+
+    // Create USM shared allocations in the specified buffer_location.
     // You can also use host allocations with malloc_host(...) API
     int *a = sycl::malloc_shared<int>(count, q);
     int *b = sycl::malloc_shared<int>(count, q);
@@ -79,9 +87,10 @@ int main() {
 
     std::cout << "Add two vectors of size " << count << std::endl;
 
-    q.single_task<IDSimpleVAdd>(SimpleVAddKernel{a, b, c, count}).wait();
+    sycl::event e = q.single_task<IDSimpleVAdd>(SimpleVAddKernel{a, b, c, count});
 
-    // verify that VC is correct
+    // Verify that outputs are correct, after the kernel has finished running.
+    e.wait();
     bool passed = true;
     for (int i = 0; i < count; i++) {
       int expected = a[i] + b[i];
@@ -107,9 +116,6 @@ int main() {
                  "ensure that your system is plugged to an FPGA board that is "
                  "set up correctly"
               << std::endl;
-    std::cerr << "   If you are targeting the FPGA emulator, compile with "
-                 "-DFPGA_EMULATOR"
-              << std::endl;
     std::terminate();
   }
 }