Skip to content

Commit 6b80ec5

Browse files
authored
FPGA: Reorder submit order for DB11 and DB12 to get more stable timing/throughput numbers (#2339)
To minimize timing variation, it is important to start the 'first' kernel of a chain of kernels last. This ensures that the timing doesn't start until that kernel is ready to go. For DB12, two pipelines are joined; if PRECISE_TIMING is defined, a dummy start kernel is added that triggers both pipelines to start.
1 parent f9a6315 commit 6b80ec5

File tree

4 files changed

+87
-44
lines changed

4 files changed

+87
-44
lines changed

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/db/CMakeLists.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,12 @@ else()
125125
set(SF_SMALL_ARG )
126126
endif()
127127

128+
if (PRECISE_TIMING)
129+
set(PRECISE_TIMING_ARG -DPRECISE_TIMING)
130+
else()
131+
set(PRECISE_TIMING_ARG )
132+
endif()
133+
128134
# setting source file based on query version
129135
if(${QUERY} EQUAL 1)
130136
set(SOURCE_FILES ${SOURCE_FILES};src/query1/query1_kernel.cpp)
@@ -144,7 +150,7 @@ endif()
144150
set(USER_FPGA_FLAGS ${USER_FPGA_FLAGS};${SEED};${CLOCK_TARGET})
145151

146152
# Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation.
147-
set(USER_FLAGS ${USER_FLAGS};-DQUERY=${QUERY};${SF_SMALL_ARG})
153+
set(USER_FLAGS ${USER_FLAGS};-DQUERY=${QUERY};${SF_SMALL_ARG};${PRECISE_TIMING_ARG})
148154

149155
# Use cmake -DUSER_INCLUDE_PATHS=<paths> to set extra paths for general
150156
# compilation.

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/db/src/query11/query11_kernel.cpp

Lines changed: 43 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -76,46 +76,6 @@ bool SubmitQuery11(queue& q, Database& dbinfo, std::string& nation,
7676
// start timer
7777
high_resolution_clock::time_point host_start = high_resolution_clock::now();
7878

79-
///////////////////////////////////////////////////////////////////////////
80-
//// ProducePartSupplier Kernel
81-
auto produce_ps_event = q.submit([&](handler& h) {
82-
// PARTSUPPLIER table accessors
83-
accessor ps_partkey_accessor(ps_partkey_buf, h, read_only);
84-
accessor ps_suppkey_accessor(ps_suppkey_buf, h, read_only);
85-
accessor ps_availqty_accessor(ps_availqty_buf, h, read_only);
86-
accessor ps_supplycost_accessor(ps_supplycost_buf, h, read_only);
87-
88-
// kernel to produce the PARTSUPPLIER table
89-
h.single_task<ProducePartSupplier>([=]() [[intel::kernel_args_restrict]] {
90-
[[intel::initiation_interval(1)]]
91-
for (size_t i = 0; i < ps_iters; i++) {
92-
// bulk read of data from global memory
93-
NTuple<kJoinWinSize, PartSupplierRow> data;
94-
95-
UnrolledLoop<0, kJoinWinSize>([&](auto j) {
96-
size_t idx = i * kJoinWinSize + j;
97-
bool in_range = idx < ps_rows;
98-
99-
DBIdentifier partkey = ps_partkey_accessor[idx];
100-
DBIdentifier suppkey = ps_suppkey_accessor[idx];
101-
int availqty = ps_availqty_accessor[idx];
102-
DBDecimal supplycost = ps_supplycost_accessor[idx];
103-
104-
data.get<j>() =
105-
PartSupplierRow(in_range, partkey, suppkey, availqty, supplycost);
106-
});
107-
108-
// write to pipe
109-
ProducePartSupplierPipe::write(
110-
PartSupplierRowPipeData(false, true, data));
111-
}
112-
113-
// tell the downstream kernel we are done producing data
114-
ProducePartSupplierPipe::write(PartSupplierRowPipeData(true, false));
115-
});
116-
});
117-
///////////////////////////////////////////////////////////////////////////
118-
11979
///////////////////////////////////////////////////////////////////////////
12080
//// JoinPartSupplierParts Kernel
12181
auto join_event = q.submit([&](handler& h) {
@@ -250,6 +210,47 @@ bool SubmitQuery11(queue& q, Database& dbinfo, std::string& nation,
250210
});
251211
///////////////////////////////////////////////////////////////////////////
252212

213+
// Must be last to ensure reliable timings
214+
///////////////////////////////////////////////////////////////////////////
215+
//// ProducePartSupplier Kernel
216+
auto produce_ps_event = q.submit([&](handler& h) {
217+
// PARTSUPPLIER table accessors
218+
accessor ps_partkey_accessor(ps_partkey_buf, h, read_only);
219+
accessor ps_suppkey_accessor(ps_suppkey_buf, h, read_only);
220+
accessor ps_availqty_accessor(ps_availqty_buf, h, read_only);
221+
accessor ps_supplycost_accessor(ps_supplycost_buf, h, read_only);
222+
223+
// kernel to produce the PARTSUPPLIER table
224+
h.single_task<ProducePartSupplier>([=]() [[intel::kernel_args_restrict]] {
225+
[[intel::initiation_interval(1)]]
226+
for (size_t i = 0; i < ps_iters; i++) {
227+
// bulk read of data from global memory
228+
NTuple<kJoinWinSize, PartSupplierRow> data;
229+
230+
UnrolledLoop<0, kJoinWinSize>([&](auto j) {
231+
size_t idx = i * kJoinWinSize + j;
232+
bool in_range = idx < ps_rows;
233+
234+
DBIdentifier partkey = ps_partkey_accessor[idx];
235+
DBIdentifier suppkey = ps_suppkey_accessor[idx];
236+
int availqty = ps_availqty_accessor[idx];
237+
DBDecimal supplycost = ps_supplycost_accessor[idx];
238+
239+
data.get<j>() =
240+
PartSupplierRow(in_range, partkey, suppkey, availqty, supplycost);
241+
});
242+
243+
// write to pipe
244+
ProducePartSupplierPipe::write(
245+
PartSupplierRowPipeData(false, true, data));
246+
}
247+
248+
// tell the downstream kernel we are done producing data
249+
ProducePartSupplierPipe::write(PartSupplierRowPipeData(true, false));
250+
});
251+
});
252+
///////////////////////////////////////////////////////////////////////////
253+
253254
// wait for kernels to finish
254255
produce_ps_event.wait();
255256
join_event.wait();
@@ -260,9 +261,9 @@ bool SubmitQuery11(queue& q, Database& dbinfo, std::string& nation,
260261
high_resolution_clock::time_point host_end = high_resolution_clock::now();
261262
duration<double, std::milli> diff = host_end - host_start;
262263

263-
// gather profiling info
264+
// gather profiling info from start of pipeline to end
264265
auto start_time =
265-
consume_sort_event
266+
produce_ps_event
266267
.get_profiling_info<info::event_profiling::command_start>();
267268
auto end_time = consume_sort_event
268269
.get_profiling_info<info::event_profiling::command_end>();

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/db/src/query12/pipe_types.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,13 @@ using LineItemProducerPipe =
117117
using JoinedProducerPipe =
118118
pipe<class JoinedProducerPipeClass, JoinedRowPipeData>;
119119

120+
#ifdef PRECISE_TIMING
121+
using OrdersProducerStartPipe =
122+
pipe<class OrdersProducerPipeStartClass, bool>;
123+
124+
using LineItemProducerStartPipe =
125+
pipe<class LineItemProducerPipeStartClass, bool>;
126+
127+
#endif
128+
120129
#endif /* __PIPE_TYPES_H__ */

DirectProgramming/C++SYCL_FPGA/ReferenceDesigns/db/src/query12/query12_kernel.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class LineItemProducer;
1616
class OrdersProducer;
1717
class Join;
1818
class Compute;
19+
class StartProduction;
1920

2021
bool SubmitQuery12(queue& q, Database& dbinfo, DBDate low_date,
2122
DBDate high_date, int shipmode1, int shipmode2,
@@ -60,6 +61,9 @@ bool SubmitQuery12(queue& q, Database& dbinfo, DBDate low_date,
6061
accessor l_receiptdate_accessor(l_receiptdate_buf, h, read_only);
6162

6263
h.single_task<LineItemProducer>([=]() [[intel::kernel_args_restrict]] {
64+
#ifdef PRECISE_TIMING
65+
(void) LineItemProducerStartPipe::read();
66+
#endif
6367
[[intel::initiation_interval(1)]]
6468
for (size_t i = 0; i < l_iters + 1; i++) {
6569
bool done = (i == l_iters);
@@ -99,6 +103,9 @@ bool SubmitQuery12(queue& q, Database& dbinfo, DBDate low_date,
99103
accessor o_orderpriority_accessor(o_orderpriority_buf, h, read_only);
100104

101105
h.single_task<OrdersProducer>([=]() [[intel::kernel_args_restrict]] {
106+
#ifdef PRECISE_TIMING
107+
(void) OrdersProducerStartPipe::read();
108+
#endif
102109
[[intel::initiation_interval(1)]]
103110
for (size_t i = 0; i < o_iters + 1; i++) {
104111
bool done = (i == o_iters);
@@ -237,7 +244,20 @@ bool SubmitQuery12(queue& q, Database& dbinfo, DBDate low_date,
237244
});
238245
/////////////////////////////////////////////////////////////////////////////
239246

240-
// wait for the Compute kernel to finish
247+
#ifdef PRECISE_TIMING
248+
// Started last to get more reliable timings
249+
/////////////////////////////////////////////////////////////////////////////
250+
//// Start Production - Ensure accurate timings
251+
auto start_production_event = q.submit([&](handler& h) {
252+
h.single_task<StartProduction>([=]() [[intel::kernel_args_restrict]] {
253+
OrdersProducerStartPipe::write(true);
254+
LineItemProducerStartPipe::write(true);
255+
});
256+
});
257+
/////////////////////////////////////////////////////////////////////////////
258+
259+
start_production_event.wait();
260+
#endif
241261
produce_orders_event.wait();
242262
produce_lineitem_event.wait();
243263
join_event.wait();
@@ -248,8 +268,15 @@ bool SubmitQuery12(queue& q, Database& dbinfo, DBDate low_date,
248268
duration<double, std::milli> diff = host_end - host_start;
249269

250270
//// gather profiling info
271+
#ifdef PRECISE_TIMING
272+
// Measure complete timing from start of pipeline to the end.
273+
auto start_time =
274+
start_production_event.get_profiling_info<info::event_profiling::command_start>();
275+
#else
276+
// Just measure computation
251277
auto start_time =
252278
compute_event.get_profiling_info<info::event_profiling::command_start>();
279+
#endif // PRECISE_TIMING
253280
auto end_time =
254281
compute_event.get_profiling_info<info::event_profiling::command_end>();
255282

0 commit comments

Comments
 (0)