FPGA: Update MatrixReadPipeToDDR to use ptr_annotations (#2487)

justin-rosner · web-flow · commit 885e033e6682 · 2024-09-18T09:16:50.000+02:00
A recent functional change to the compiler means that it now correctly identifies memory dependences. As a side effect, the compiler now emits a message that it is unable to achieve the user-specified II for the MatrixReadPipeToDDR function in memory_transfers.hpp.

To regain this performance, we can use annotated_ptrs in the SYCL HLS flow to specify a larger interface width, which allows the compiler to coalesce stores to memory and thus achieve the user-specified II again.
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/pca/src/memory_transfers.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/pca/src/memory_transfers.hpp
@@ -5,6 +5,11 @@
 #include "tuple.hpp"
 #include "unrolled_loop.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
+
+constexpr int BL0 = 0;
+
 /*
   Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
   elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
@@ -66,7 +71,12 @@ template <typename TT,            // Datatype of the elements of the matrix
           typename MatrixPipe     // Input matrix
           >
 void MatrixReadPipeToDDR(
-    TT* matrix_ptr,    // Output matrix pointer
+#if defined (IS_BSP)
+    TT* matrix_ptr,  // Output matrix pointer
+#else
+    annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
+                                          dwidth<512>})> matrix_ptr,
+#endif
     int matrix_count,  // Number of matrix to write to DDR
     int repetitions    // Number of time to read the same matrix to the pipe
 ) {
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/pca/src/pca.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/pca/src/pca.hpp
@@ -11,6 +11,9 @@
 #include "streaming_eigen.hpp"
 #include "tuple.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
+
 // Forward declare the kernel and pipe names
 // (This prevents unwanted name mangling in the optimization report.)
 class InputMatrixFromDDRToLocalMem;
@@ -115,6 +118,13 @@ void PCAKernel(
     std::terminate();
   }
 
+#if not defined (IS_BSP)
+    constexpr int BL0 = 0;
+    using PtrAnn = annotated_ptr<T, decltype(properties{buffer_location<BL0>,
+                                                        dwidth<512>})>;
+    PtrAnn eigen_vectors_device_ptr(eigen_vectors_device);
+#endif
+
   // Check that the malloc succeeded.
   if (input_matrix_device == nullptr) {
     std::cerr << "Error when allocating the input matrix." << std::endl;
@@ -184,12 +194,19 @@ void PCAKernel(
             rank_deficient_flag_device, matrix_count, repetitions);
       });
 
-  // Write the Eigen vectors from local memory to FPGA DDR
+  // Write the Eigen vectors from local memory to FPGA DDR. If we have USM
+  // device allocations then we want to use eigen_vectors_device, but if we
+  // have USM shared allocations then we want to use eigen_vectors_device_ptr.
   auto eigen_vectors_event = q.single_task<EigenVectorsFromLocalMemToDDR>([=
   ]() [[intel::kernel_args_restrict]] {
     MatrixReadPipeToDDR<T, k_features_count, k_features_count,
                         kNumElementsPerDDRBurst, EigenVectorsPipe>(
-        eigen_vectors_device, matrix_count, repetitions);
+#if defined (IS_BSP)
+        eigen_vectors_device,
+#else
+        eigen_vectors_device_ptr,
+#endif
+        matrix_count, repetitions);
   });
 
   // Wait for the completion of the pipeline
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qrd/src/memory_transfers.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qrd/src/memory_transfers.hpp
@@ -5,6 +5,11 @@
 #include "constexpr_math.hpp"
 #include "unrolled_loop.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
+
+constexpr int BL0 = 0;
+
 /*
   Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
   elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
@@ -120,7 +125,12 @@ template <typename TT,           // Datatype of the elements of the matrix
           typename MatrixPipe    // Input matrix
           >
 void MatrixReadPipeToDDR(
+#if defined (IS_BSP)
     TT* matrix_ptr,  // Output matrix pointer
+# else
+    annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
+                                          dwidth<512>})> matrix_ptr,
+#endif
     int matrix_count,// Number of matrix to write to DDR
     int repetitions  // Number of time to read the same matrix to the pipe
     ) {
@@ -146,8 +156,8 @@ void MatrixReadPipeToDDR(
   sycl::ext::intel::device_ptr<TT> matrix_ptr_located(matrix_ptr);
 #else
   // Device pointers are not supported when targeting an FPGA 
-  // family/part
-  TT* matrix_ptr_located(matrix_ptr);
+  // family/part. We want to use the ptr_annotation that was defined in qrd.hpp
+  auto matrix_ptr_located = matrix_ptr;
 #endif  
 
 
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qrd/src/qrd.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qrd/src/qrd.hpp
@@ -15,6 +15,9 @@
 #include "streaming_qrd.hpp"
 #include "tuple.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
+
 // Forward declare the kernel and pipe names
 // (This prevents unwanted name mangling in the optimization report.)
 class QRDDDRToLocalMem;
@@ -68,8 +71,13 @@ void QRDecompositionImpl(
 #else
   // malloc_device are not supported when targetting an FPGA part/family
   TT *a_device = sycl::malloc_shared<TT>(kAMatrixSize * matrix_count, q);
-  TT *q_device = sycl::malloc_shared<TT>(kQMatrixSize * matrix_count, q);
   TT *r_device = sycl::malloc_shared<TT>(kRMatrixSize * matrix_count, q);
+
+  constexpr int BL0 = 0;
+  using PtrAnn = annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
+                                                       dwidth<512>})>;
+  TT *q_device = sycl::malloc_shared<TT>(kQMatrixSize * matrix_count, q);
+  PtrAnn q_device_ptr(q_device);
 #endif  
 
   q.memcpy(a_device, a_matrix.data(), kAMatrixSize * matrix_count
@@ -96,7 +104,13 @@ void QRDecompositionImpl(
     // Read the Q matrix from the QMatrixPipe pipe and copy it to the
     // FPGA DDR
     MatrixReadPipeToDDR<TT, rows, columns, kNumElementsPerDDRBurst,
-                        QMatrixPipe>(q_device, matrix_count, repetitions);
+                        QMatrixPipe>(
+#if defined (IS_BSP)
+                          q_device,
+#else
+                          q_device_ptr,
+#endif
+                          matrix_count, repetitions);
   });
 
   auto r_event = q.single_task<QRDLocalMemToDDRR>([=
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qri/src/memory_transfers.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qri/src/memory_transfers.hpp
@@ -5,6 +5,11 @@
 #include "constexpr_math.hpp"
 #include "unrolled_loop.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
+
+constexpr int BL0 = 0;
+
 /*
   Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
   elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
@@ -120,7 +125,12 @@ template <typename TT,           // Datatype of the elements of the matrix
           typename MatrixPipe    // Input matrix
           >
 void MatrixReadPipeToDDR(
-    TT* matrix_ptr,  // Output matrix pointer
+#if defined (IS_BSP)
+    TT* matrix_ptr,  // Output matrix pointer
+#else
+    annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
+                                          dwidth<512>})> matrix_ptr,
+#endif
     int matrix_count,// Number of matrix to write to DDR
     int repetitions  // Number of time to read the same matrix to the pipe
     ) {
@@ -146,8 +156,8 @@ void MatrixReadPipeToDDR(
   sycl::ext::intel::device_ptr<TT> matrix_ptr_located(matrix_ptr);
 #else
   // Device pointers are not supported when targeting an FPGA 
-  // family/part
-  TT* matrix_ptr_located(matrix_ptr);
+  // family/part. We want to use the ptr_annotation that was defined in qri.hpp
+  auto matrix_ptr_located = matrix_ptr;
 #endif
 
   // Repeatedly read matrix_count matrices from the pipe and write them to DDR
diff --git a/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qri/src/qri.hpp b/DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qri/src/qri.hpp
@@ -15,6 +15,9 @@
 #include "streaming_qri.hpp"
 #include "tuple.hpp"
 
+using namespace sycl::ext::intel::experimental;
+using namespace sycl::ext::oneapi::experimental;
+
 // Forward declare the kernel and pipe names
 // (This prevents unwanted name mangling in the optimization report.)
 class QRIDDRToLocalMem;
@@ -74,7 +77,13 @@ void QRIImpl(
 #else
   // malloc_device are not supported when targetting an FPGA part/family
   TT *a_device = sycl::malloc_shared<TT>(kAMatrixSize * matrix_count, q);
-  TT *i_device = sycl::malloc_shared<TT>(kInverseMatrixSize * matrix_count, q);
+
+  constexpr int BL0 = 0;
+  using PtrAnn = annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
+                                                       dwidth<512>})>;
+  TT *i_device = sycl::malloc_shared<TT>(kInverseMatrixSize * matrix_count,
+                                             q);
+  PtrAnn i_device_ptr(i_device);
 #endif  
 
 
@@ -109,7 +118,13 @@ void QRIImpl(
       // Read the inverse matrix from the InverseMatrixPipe pipe and copy it
       // to the FPGA DDR
       MatrixReadPipeToDDR<TT, rows, columns, kNumElementsPerDDRBurst,
-              InverseMatrixPipe>(i_device, matrix_count, repetitions);
+              InverseMatrixPipe>(
+#if defined (IS_BSP)
+                i_device,
+#else
+                i_device_ptr,
+#endif
+                matrix_count, repetitions);
   });
 
   i_event.wait();
@@ -132,7 +147,7 @@ void QRIImpl(
 
   // Copy the Q and R matrices result from the FPGA DDR to the host memory
   q.memcpy(inverse_matrix.data(), i_device,
-               kInverseMatrixSize * matrix_count * sizeof(TT)).wait();
+           kInverseMatrixSize * matrix_count * sizeof(TT)).wait();
 
   // Clean allocated FPGA memory
     free(a_device, q);