Skip to content

Commit 885e033

Browse files
FPGA: Update MatrixReadPipeToDDR to use ptr_annotations (#2487)
A recent functional change to the compiler means that it will now correctly identify memory dependences. As a side effect, this will now cause for the compiler to emit a message that it is unable to achieve a user specified II for the MatrixReadPipeToDDR function in memory_transfers.hpp. To regain this performance we can use annotated_ptr's in the SYCL HLS flow to specify a larger interface width which will allow for the compiler to coalesce stores to memory, thus resulting in being able to achieve the user specified II again.
1 parent ccfae3f commit 885e033

File tree

6 files changed

+89
-13
lines changed

6 files changed

+89
-13
lines changed

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/pca/src/memory_transfers.hpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
#include "tuple.hpp"
66
#include "unrolled_loop.hpp"
77

8+
using namespace sycl::ext::intel::experimental;
9+
using namespace sycl::ext::oneapi::experimental;
10+
11+
constexpr int BL0 = 0;
12+
813
/*
914
Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
1015
elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
@@ -66,7 +71,12 @@ template <typename TT, // Datatype of the elements of the matrix
6671
typename MatrixPipe // Input matrix
6772
>
6873
void MatrixReadPipeToDDR(
69-
TT* matrix_ptr, // Output matrix pointer
74+
#if defined (IS_BSP)
75+
TT matrix_ptr, // Output matrix pointer
76+
# else
77+
annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
78+
dwidth<512>})> matrix_ptr,
79+
#endif
7080
int matrix_count, // Number of matrix to write to DDR
7181
int repetitions // Number of time to read the same matrix to the pipe
7282
) {

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/pca/src/pca.hpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
#include "streaming_eigen.hpp"
1212
#include "tuple.hpp"
1313

14+
using namespace sycl::ext::intel::experimental;
15+
using namespace sycl::ext::oneapi::experimental;
16+
1417
// Forward declare the kernel and pipe names
1518
// (This prevents unwanted name mangling in the optimization report.)
1619
class InputMatrixFromDDRToLocalMem;
@@ -115,6 +118,13 @@ void PCAKernel(
115118
std::terminate();
116119
}
117120

121+
#if not defined (IS_BSP)
122+
constexpr int BL0 = 0;
123+
using PtrAnn = annotated_ptr<T, decltype(properties{buffer_location<BL0>,
124+
dwidth<512>})>;
125+
PtrAnn eigen_vectors_device_ptr(eigen_vectors_device);
126+
#endif
127+
118128
// Check that the malloc succeeded.
119129
if (input_matrix_device == nullptr) {
120130
std::cerr << "Error when allocating the input matrix." << std::endl;
@@ -184,12 +194,19 @@ void PCAKernel(
184194
rank_deficient_flag_device, matrix_count, repetitions);
185195
});
186196

187-
// Write the Eigen vectors from local memory to FPGA DDR
197+
// Write the Eigen vectors from local memory to FPGA DDR. If we have USM
198+
// device allocations then we want to use eigen_vectors_device, but if we
199+
// have USM shared allocations then we want to use eigen_vectors_device_ptr.
188200
auto eigen_vectors_event = q.single_task<EigenVectorsFromLocalMemToDDR>([=
189201
]() [[intel::kernel_args_restrict]] {
190202
MatrixReadPipeToDDR<T, k_features_count, k_features_count,
191203
kNumElementsPerDDRBurst, EigenVectorsPipe>(
192-
eigen_vectors_device, matrix_count, repetitions);
204+
#if defined (IS_BSP)
205+
eigen_vectors_device,
206+
#else
207+
eigen_vectors_device_ptr,
208+
#endif
209+
matrix_count, repetitions);
193210
});
194211

195212
// Wait for the completion of the pipeline

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qrd/src/memory_transfers.hpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
#include "constexpr_math.hpp"
66
#include "unrolled_loop.hpp"
77

8+
using namespace sycl::ext::intel::experimental;
9+
using namespace sycl::ext::oneapi::experimental;
10+
11+
constexpr int BL0 = 0;
12+
813
/*
914
Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
1015
elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
@@ -120,7 +125,12 @@ template <typename TT, // Datatype of the elements of the matrix
120125
typename MatrixPipe // Input matrix
121126
>
122127
void MatrixReadPipeToDDR(
128+
#if defined (IS_BSP)
123129
TT* matrix_ptr, // Output matrix pointer
130+
# else
131+
annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
132+
dwidth<512>})> matrix_ptr,
133+
#endif
124134
int matrix_count,// Number of matrix to write to DDR
125135
int repetitions // Number of time to read the same matrix to the pipe
126136
) {
@@ -146,8 +156,8 @@ void MatrixReadPipeToDDR(
146156
sycl::ext::intel::device_ptr<TT> matrix_ptr_located(matrix_ptr);
147157
#else
148158
// Device pointers are not supported when targeting an FPGA
149-
// family/part
150-
TT* matrix_ptr_located(matrix_ptr);
159+
// family/part. We want to use the ptr_annotation that was definied in qrd.hpp
160+
auto matrix_ptr_located = matrix_ptr;
151161
#endif
152162

153163

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qrd/src/qrd.hpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
#include "streaming_qrd.hpp"
1616
#include "tuple.hpp"
1717

18+
using namespace sycl::ext::intel::experimental;
19+
using namespace sycl::ext::oneapi::experimental;
20+
1821
// Forward declare the kernel and pipe names
1922
// (This prevents unwanted name mangling in the optimization report.)
2023
class QRDDDRToLocalMem;
@@ -68,8 +71,13 @@ void QRDecompositionImpl(
6871
#else
6972
// malloc_device are not supported when targetting an FPGA part/family
7073
TT *a_device = sycl::malloc_shared<TT>(kAMatrixSize * matrix_count, q);
71-
TT *q_device = sycl::malloc_shared<TT>(kQMatrixSize * matrix_count, q);
7274
TT *r_device = sycl::malloc_shared<TT>(kRMatrixSize * matrix_count, q);
75+
76+
constexpr int BL0 = 0;
77+
using PtrAnn = annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
78+
dwidth<512>})>;
79+
TT *q_device = sycl::malloc_shared<TT>(kQMatrixSize * matrix_count, q);
80+
PtrAnn q_device_ptr(q_device);
7381
#endif
7482

7583
q.memcpy(a_device, a_matrix.data(), kAMatrixSize * matrix_count
@@ -96,7 +104,13 @@ void QRDecompositionImpl(
96104
// Read the Q matrix from the QMatrixPipe pipe and copy it to the
97105
// FPGA DDR
98106
MatrixReadPipeToDDR<TT, rows, columns, kNumElementsPerDDRBurst,
99-
QMatrixPipe>(q_device, matrix_count, repetitions);
107+
QMatrixPipe>(
108+
#if defined (IS_BSP)
109+
q_device,
110+
#else
111+
q_device_ptr,
112+
#endif
113+
matrix_count, repetitions);
100114
});
101115

102116
auto r_event = q.single_task<QRDLocalMemToDDRR>([=

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qri/src/memory_transfers.hpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
#include "constexpr_math.hpp"
66
#include "unrolled_loop.hpp"
77

8+
using namespace sycl::ext::intel::experimental;
9+
using namespace sycl::ext::oneapi::experimental;
10+
11+
constexpr int BL0 = 0;
12+
813
/*
914
Read matrix_count matrices of type TT from DDR by bursts of num_elem_per_bank
1015
elements, and write the matrices to the "MatrixPipe" pipe num_elem_per_bank by
@@ -120,7 +125,12 @@ template <typename TT, // Datatype of the elements of the matrix
120125
typename MatrixPipe // Input matrix
121126
>
122127
void MatrixReadPipeToDDR(
123-
TT* matrix_ptr, // Output matrix pointer
128+
#if defined (IS_BSP)
129+
TT matrix_ptr, // Output matrix pointer
130+
# else
131+
annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
132+
dwidth<512>})> matrix_ptr,
133+
#endif
124134
int matrix_count,// Number of matrix to write to DDR
125135
int repetitions // Number of time to read the same matrix to the pipe
126136
) {
@@ -146,8 +156,8 @@ void MatrixReadPipeToDDR(
146156
sycl::ext::intel::device_ptr<TT> matrix_ptr_located(matrix_ptr);
147157
#else
148158
// Device pointers are not supported when targeting an FPGA
149-
// family/part
150-
TT* matrix_ptr_located(matrix_ptr);
159+
// family/part. We want to use the ptr_annotation that was definied in qri.hpp
160+
auto matrix_ptr_located = matrix_ptr;
151161
#endif
152162

153163
// Repeatedly read matrix_count matrices from the pipe and write them to DDR

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/qri/src/qri.hpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
#include "streaming_qri.hpp"
1616
#include "tuple.hpp"
1717

18+
using namespace sycl::ext::intel::experimental;
19+
using namespace sycl::ext::oneapi::experimental;
20+
1821
// Forward declare the kernel and pipe names
1922
// (This prevents unwanted name mangling in the optimization report.)
2023
class QRIDDRToLocalMem;
@@ -74,7 +77,13 @@ void QRIImpl(
7477
#else
7578
// malloc_device are not supported when targetting an FPGA part/family
7679
TT *a_device = sycl::malloc_shared<TT>(kAMatrixSize * matrix_count, q);
77-
TT *i_device = sycl::malloc_shared<TT>(kInverseMatrixSize * matrix_count, q);
80+
81+
constexpr int BL0 = 0;
82+
using PtrAnn = annotated_ptr<TT, decltype(properties{buffer_location<BL0>,
83+
dwidth<512>})>;
84+
TT *i_device = sycl::malloc_shared<TT>(kInverseMatrixSize * matrix_count,
85+
q);
86+
PtrAnn i_device_ptr(i_device);
7887
#endif
7988

8089

@@ -109,7 +118,13 @@ void QRIImpl(
109118
// Read the inverse matrix from the InverseMatrixPipe pipe and copy it
110119
// to the FPGA DDR
111120
MatrixReadPipeToDDR<TT, rows, columns, kNumElementsPerDDRBurst,
112-
InverseMatrixPipe>(i_device, matrix_count, repetitions);
121+
InverseMatrixPipe>(
122+
#if defined (IS_BSP)
123+
i_device,
124+
#else
125+
i_device_ptr,
126+
#endif
127+
matrix_count, repetitions);
113128
});
114129

115130
i_event.wait();
@@ -132,7 +147,7 @@ void QRIImpl(
132147

133148
// Copy the Q and R matrices result from the FPGA DDR to the host memory
134149
q.memcpy(inverse_matrix.data(), i_device,
135-
kInverseMatrixSize * matrix_count * sizeof(TT)).wait();
150+
kInverseMatrixSize * matrix_count * sizeof(TT)).wait();
136151

137152
// Clean allocated FPGA memory
138153
free(a_device, q);

0 commit comments

Comments
 (0)