Skip to content

Commit e761259

Browse files
committed
refactoring
1 parent f115a93 commit e761259

File tree

8 files changed

+82
-57
lines changed

8 files changed

+82
-57
lines changed

benchmark/conv1d_bench.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,6 @@
77
static constexpr auto __WG_SIZE = 1024;
88
static constexpr real_t __INIT_VALUE = 7.3;
99

10-
auto
11-
sycl_alloc(size_t size, sycl::queue &q) {
12-
return sycl::malloc_device<real_t>(size, q);
13-
}
14-
1510
// ==========================================
1611
struct BenchmarkConv1dParams {
1712
int batch_size;

src/CMakeLists.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
add_subdirectory(config)
2-
# add_subdirectory(core)
32

43
function(add_bkma_executable name)
54
add_executable(${name} ${name}.cpp)
65

76
target_link_libraries(${name}
87
PUBLIC
98
bkma::config)
10-
# bkma::core)
119

1210
target_include_directories(${name}
1311
PUBLIC
@@ -27,5 +25,5 @@ function(add_bkma_executable name)
2725
endfunction()
2826

2927
# Add executables
30-
# add_bkma_executable(advection)
28+
add_bkma_executable(advection)
3129
add_bkma_executable(conv1d)

src/advection.cpp

Lines changed: 19 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include <AdvectionParams.hpp>
22
#include <AdvectionSolver.hpp>
3-
#include <advectors.hpp>
43
#include <iostream>
54
#include <sycl/sycl.hpp>
65
#include <unique_ref.hpp>
@@ -15,16 +14,18 @@
1514
// ==========================================
1615
// returns the total duration of the maxIter iterations of the time loop
1716
std::chrono::duration<double>
18-
advection(sycl::queue &Q, real_t* fidst_dev,
19-
sref::unique_ref<IAdvectorX> &advector, const AdvectionSolver &solver) {
17+
advection(sycl::queue &Q, span3d_t data, const AdvectionSolver &solver,
18+
BkmaOptimParams &optim_params) {
2019

2120
auto static const maxIter = solver.params.maxIter;
2221

2322
auto start = std::chrono::high_resolution_clock::now();
2423
// Time loop
2524
for (size_t t = 0; t < maxIter; ++t) {
26-
advector(Q, fidst_dev, solver);
25+
bkma_run<AdvectionSolver, BkmaImpl::AdaptiveWg>(Q, data, solver,
26+
optim_params);
2727
Q.wait();
28+
2829
} // end for t < T
2930
auto end = std::chrono::high_resolution_clock::now();
3031

@@ -42,24 +43,11 @@ main(int argc, char **argv) {
4243
ADVParamsNonCopyable strParams;// = ADVParamsNonCopyable();
4344
strParams.setup(configMap);
4445

45-
const auto run_on_gpu = strParams.gpu;
46-
47-
sycl::device d;
48-
if (run_on_gpu)
49-
try {
50-
d = sycl::device{sycl::gpu_selector_v};
51-
} catch (const sycl::exception e) {
52-
std::cout
53-
<< "GPU was requested but none is available, running kernels "
54-
"on the CPU\n"
55-
<< std::endl;
56-
d = sycl::device{sycl::cpu_selector_v};
57-
strParams.gpu = false;
58-
}
59-
else
60-
d = sycl::device{sycl::cpu_selector_v};
61-
62-
sycl::queue Q{d};
46+
const bool run_on_gpu = strParams.gpu;
47+
auto device = pick_device(run_on_gpu);
48+
strParams.gpu = device.is_gpu() ? true : false;
49+
50+
sycl::queue Q{device};
6351

6452
/* Display infos on current device */
6553
std::cout << "Using device: "
@@ -76,24 +64,21 @@ main(int argc, char **argv) {
7664

7765
/* Buffer for the distribution function containing the probabilities of
7866
having a particle at a particular speed and position, plus a fictive dim */
79-
real_t* fdist = sycl::malloc_device<real_t>(n0*n1*n2, Q);
67+
span3d_t data(sycl_alloc(n0*n1*n2, Q), n0, n1, n2);
8068
Q.wait();
81-
fill_buffer(Q, fdist, params);
69+
fill_buffer_adv(Q, data, params);
8270

8371
AdvectionSolver solver(params);
84-
auto advector = kernel_impl_factory(Q, strParams, solver);
72+
auto optim_params = create_optim_params<ADVParams>(Q, params);
8573

86-
auto elapsed_seconds = advection(Q, fdist, advector, solver);
74+
auto elapsed_seconds = advection(Q, data, solver, optim_params);
8775

88-
std::cout << "\nRESULTS_VALIDATION:" << std::endl;
89-
validate_result(Q, fdist, params);
76+
validate_result_adv(Q, data, params);
9077

91-
std::cout << "PERF_DIAGS:" << std::endl;
92-
std::cout << "elapsed_time: " << elapsed_seconds.count() << " s\n";
78+
auto const n_cells = n0 * n1 * n2 * (maxIter);
79+
print_perf(elapsed_seconds.count(), n_cells);
9380

94-
auto gcells = ((n0*n1*n2*(maxIter)) / elapsed_seconds.count()) / 1e9;
95-
std::cout << "upd_cells_per_sec: " << gcells << " Gcell/sec\n";
96-
std::cout << "estimated_throughput: " << gcells * sizeof(real_t) * 2
97-
<< " GB/s" << std::endl;
81+
sycl::free(data.data_handle(), Q);
82+
Q.wait();
9883
return 0;
9984
}

src/config/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ target_sources(config
1111
target_include_directories(config
1212
PUBLIC
1313
${CMAKE_SOURCE_DIR}/src/config
14+
${CMAKE_SOURCE_DIR}/src/tools
1415
${CMAKE_SOURCE_DIR}/src)
1516

1617
target_link_libraries(config PUBLIC mdspan)

src/conv1d.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,6 @@
88
#include <init.hpp>
99
#include <validation.hpp>
1010

11-
auto
12-
sycl_alloc(size_t size, sycl::queue &q) {
13-
return sycl::malloc_shared<real_t>(size, q);
14-
}
15-
1611
// ==========================================
1712
// ==========================================
1813
int
@@ -110,5 +105,7 @@ main(int argc, char **argv) {
110105
sycl::free(weight.data_handle(), Q);
111106
sycl::free(bias.data_handle(), Q);
112107
sycl::free(data.data_handle(), Q);
108+
Q.wait();
109+
113110
return 0;
114111
}

src/tools/init.hpp

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
#include <AdvectionParams.hpp>
33
#include <cmath>
44
#include <sycl/sycl.hpp>
5+
#include <bkma.hpp>
56

7+
// ==========================================
8+
// ==========================================
69
inline sycl::device
710
pick_device(bool run_on_gpu) {
811
sycl::device d;
@@ -21,27 +24,26 @@ pick_device(bool run_on_gpu) {
2124
d = sycl::device{sycl::cpu_selector_v};
2225

2326
return d;
24-
}
27+
} //end pick_device
2528

2629
// ==========================================
2730
// ==========================================
2831
void
29-
fill_buffer_adv(sycl::queue &q, real_t *fdist_dev, const ADVParams &params) {
32+
fill_buffer_adv(sycl::queue &q, span3d_t &data, const ADVParams &params) {
3033
const auto n0 = params.n0, n1 = params.n1, n2 = params.n2;
3134

3235
sycl::range r3d(n0, n1, n2);
3336
q.submit([&](sycl::handler &cgh) {
3437
cgh.parallel_for(r3d, [=](auto i) {
35-
span3d_t fdist(fdist_dev, n0, n1, n2);
3638
const size_t i0 = i[0];
3739
const size_t i1 = i[1];
3840
const size_t i2 = i[2];
3941

4042
real_t x = params.minRealX + i1 * params.dx;
41-
fdist(i0, i1, i2) = sycl::sin(4 * x * M_PI);
43+
data(i0, i1, i2) = sycl::sin(4 * x * M_PI);
4244
}); // end parallel_for
4345
}).wait(); // end q.submit
44-
}
46+
} // end fill_buffer_adv
4547

4648
// ==========================================
4749
// ==========================================
@@ -67,4 +69,33 @@ fill_buffer_conv1d(sycl::queue &q, span3d_t &data, span3d_t &warmup_data,
6769
[=](unsigned itm) { bias(itm) = 1.0; });
6870

6971
q.wait();
70-
}
72+
} // end fill_buffer_conv1d
73+
74+
// ==========================================
75+
// ==========================================
76+
template <typename Params>
77+
BkmaOptimParams create_optim_params(sycl::queue &q, const Params &params) {
78+
const auto n0 = params.n0;
79+
const auto n1 = params.n1;
80+
const auto n2 = params.n2;
81+
82+
WorkItemDispatch wi_dispatch;
83+
wi_dispatch.set_ideal_sizes(params.pref_wg_size, n0, n1, n2);
84+
auto max_elem_local_mem =
85+
q.get_device().get_info<sycl::info::device::local_mem_size>() /
86+
sizeof(real_t);
87+
wi_dispatch.adjust_sizes_mem_limit(max_elem_local_mem, n1);
88+
89+
WorkGroupDispatch wg_dispatch;
90+
wg_dispatch.set_num_work_groups(n0, n2, params.seq_size0, params.seq_size2,
91+
wi_dispatch.w0_, wi_dispatch.w2_);
92+
93+
/* TODO : here compute the number of batchs */
94+
return BkmaOptimParams{{1, n0, n0}, // BatchConfig1D dispatch_d0
95+
{1, n2, n2}, // BatchConfig1D dispatch_d2
96+
wi_dispatch.w0_, // size_t w0
97+
wi_dispatch.w1_, // size_t w1
98+
wi_dispatch.w2_, // size_t w2
99+
wg_dispatch, // WorkGroupDispatch wg_disp
100+
MemorySpace::Local};
101+
} //end create_optim_params

src/tools/validation.hpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
// ==========================================
77
// ==========================================
88
real_t
9-
validate_result_adv(sycl::queue &Q, real_t *fdist_dev, const ADVParams &params,
9+
validate_result_adv(sycl::queue &Q, span3d_t &data, const ADVParams &params,
1010
bool do_print = true) {
11+
std::cout << "\nRESULTS_VALIDATION:" << std::endl;
12+
1113
auto const dx = params.dx;
1214
auto const dvx = params.dvx;
1315
auto const dt = params.dt;
@@ -41,11 +43,9 @@ validate_result_adv(sycl::queue &Q, real_t *fdist_dev, const ADVParams &params,
4143

4244
cgh.parallel_for(
4345
r2d, errorl1_reduc, [=](auto itm, auto &errorl1_reduc) {
44-
span3d_t fdist(fdist_dev, params.n0, params.n1,
45-
params.n2);
4646
auto i1 = itm[1];
4747
auto i0 = itm[0];
48-
auto f = fdist(i0, i1, i2);
48+
auto f = data(i0, i1, i2);
4949

5050
real_t const x = minRealX + i1 * dx;
5151
real_t const v = minRealVx + i0 * dvx;
@@ -147,3 +147,16 @@ validate_conv1d(sycl::queue &Q, span3d_t &data, size_t nw) {
147147
else
148148
std::cout << "All values data[:,i1,:] are equal." << std::endl;
149149
} // end validate_conv1d
150+
151+
// ==========================================
152+
// ==========================================
153+
void print_perf(const double elapsed_seconds, const size_t n_cells){
154+
155+
std::cout << "PERF_DIAGS:" << std::endl;
156+
std::cout << "elapsed_time: " << elapsed_seconds << " s\n";
157+
158+
auto gcells = (n_cells / elapsed_seconds) / 1e9;
159+
std::cout << "upd_cells_per_sec: " << gcells << " Gcell/sec\n";
160+
std::cout << "estimated_throughput: " << gcells * sizeof(real_t) * 2
161+
<< " GB/s" << std::endl;
162+
}

src/types.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
using real_t = double;
66

7+
[[nodiscard]] inline auto
8+
sycl_alloc(size_t size, sycl::queue &q) {
9+
return sycl::malloc_shared<real_t>(size, q);
10+
}
11+
712
using span0d_t =
813
std::experimental::mdspan<real_t, std::experimental::dextents<size_t, 0>,
914
std::experimental::layout_right>;

0 commit comments

Comments
 (0)