Skip to content

Commit 3467eeb

Browse files
Factored out common routines into cg_solver.hpp
The header and routines defined there are also used in standalone_cpp target built from cpp/main.cpp ``` (idp_2022) [08:27:56 ansatnuc04 onemkl_gemv]$ $(find . -name cmake-build)/standalone_cpp 1000 11 Solving 1000 by 1000 diagonal system with rank-11 perturbation. Converged in : 11 , 11 , 11 , 11 , 11 , 11 , Wall-clock cg_solve execution times: 683.416 , 408.391 , 411.849 , 412.661 , 412.317 , 412.658 , Redisual norm squared: 9.15541e-25 (idp_2022) [08:28:20 ansatnuc04 onemkl_gemv]$ python sycl_timing_solver.py 1000 11 Solving 1000 by 1000 diagonal linear system with rank 11 perturbation. Name Intel(R) UHD Graphics [0x9bca] Driver version 1.3.22992 Vendor Intel(R) Corporation Profile FULL_PROFILE Filter string level_zero:gpu:0 Using not in-order queue 0 (host_dt, device_dt)= (1156.8981241434813, 404.2941620000001) 1 (host_dt, device_dt)= (422.38221131265163, 404.5959500000001) 2 (host_dt, device_dt)= (423.1058154255152, 404.4037220000001) 3 (host_dt, device_dt)= (422.79740050435066, 403.78802800000005) 4 (host_dt, device_dt)= (424.4473725557327, 404.0881560000001) 5 (host_dt, device_dt)= (425.1241758465767, 404.55793600000004) Converged in: [11, 11, 11, 11, 11, 11] Python solution residual norm squared: 3.5289368114384933e-25 ``` README is also expanded.
1 parent c11811a commit 3467eeb

File tree

7 files changed

+465
-109
lines changed

7 files changed

+465
-109
lines changed

examples/pybind11/onemkl_gemv/CMakeLists.txt

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ pybind11_add_module(${py_module_name}
3838
sycl_gemm/_onemkl.cpp
3939
)
4040
target_include_directories(${py_module_name}
41-
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR}
41+
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
4242
)
4343
target_link_libraries(${py_module_name}
4444
PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
@@ -53,4 +53,18 @@ set_source_files_properties(${_sycl_gemm_sources}
5353
COMPILE_OPTIONS "-O3;-Wno-deprecated-declarations"
5454
)
5555

56+
add_executable(standalone_cpp
57+
EXCLUDE_FROM_ALL
58+
cpp/main.cpp
59+
)
60+
target_compile_options(standalone_cpp
61+
PRIVATE -O3 -Wno-deprecated-declarations
62+
)
63+
target_include_directories(standalone_cpp
64+
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
65+
)
66+
target_link_libraries(standalone_cpp
67+
PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
68+
)
69+
5670
set(ignoreMe "${SKBUILD}")
Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,32 @@
11
Example of SYCL built pybind11 extension
22

3-
To build, use (assumes scikit-build and dpcpp) is installed
3+
To build, use (assumes scikit-build and dpcpp is installed):
44

55
```sh
6-
python setup.py develop -- -G "Ninja" -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
6+
python setup.py develop -- -G "Ninja" \
7+
-DCMAKE_C_COMPILER:PATH=icx \
8+
-DCMAKE_CXX_COMPILER:PATH=icpx \
9+
-DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib \
10+
-DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib \
11+
-DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include \
12+
-DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
713
```
814

915
To run test suite
1016

1117
```sh
1218
python -m pytest tests
1319
```
20+
21+
To compare Python overhead,
22+
23+
```
24+
# build standad-alone executable
25+
cmake --build $(find . -name cmake-build) --target standalone_cpp
26+
# execute it
27+
$(find . -name cmake-build)/standalone_cpp 1000 11
28+
# launch Python computatin
29+
python sycl_timing_solver.py 1000 11
30+
```
31+
32+
Compare host times vs. C++ wall-clock times while making sure that the number of iterations is the same
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#include "cg_solver.hpp"
2+
#include <CL/sycl.hpp>
3+
#include <chrono>
4+
#include <iostream>
5+
#include <oneapi/mkl.hpp>
6+
7+
using T = double;
8+
9+
int main(int argc, char *argv[])
10+
{
11+
size_t n = 1000;
12+
size_t rank = 16;
13+
14+
if (argc > 1) {
15+
n = std::stoi(argv[1]);
16+
}
17+
18+
if (argc > 2) {
19+
rank = std::stoi(argv[2]);
20+
}
21+
22+
std::cout << "Solving " << n << " by " << n << " diagonal system with rank-"
23+
<< rank << " perturbation." << std::endl;
24+
25+
sycl::queue q;
26+
27+
// USM allocation for data needed by program
28+
size_t buf_size = n * n + rank * n + 2 * n;
29+
T *buf = sycl::malloc_device<T>(buf_size, q);
30+
sycl::event memset_ev = q.fill<T>(buf, T(0), buf_size);
31+
32+
T *Amat = buf;
33+
T *umat = buf + n * n;
34+
T *bvec = umat + rank * n;
35+
T *sol_vec = bvec + n;
36+
37+
sycl::event set_diag_ev = q.submit([&](sycl::handler &cgh) {
38+
cgh.depends_on({memset_ev});
39+
cgh.parallel_for({n}, [=](sycl::id<1> id) {
40+
auto i = id[0];
41+
Amat[i * (n + 1)] = T(1);
42+
});
43+
});
44+
45+
oneapi::mkl::rng::philox4x32x10 engine(q, 7777);
46+
oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>
47+
distr(0.0, 1.0);
48+
49+
// populate umat and bvec in one call
50+
sycl::event umat_rand_ev =
51+
oneapi::mkl::rng::generate(distr, engine, n * rank + n, umat);
52+
53+
sycl::event syrk_ev = oneapi::mkl::blas::row_major::syrk(
54+
q, oneapi::mkl::uplo::U, oneapi::mkl::transpose::N, n, rank, T(1), umat,
55+
rank, T(1), Amat, n, {umat_rand_ev, set_diag_ev});
56+
57+
// need to transpose
58+
sycl::event transpose_ev = q.submit([&](sycl::handler &cgh) {
59+
cgh.depends_on(syrk_ev);
60+
cgh.parallel_for({n * n}, [=](sycl::id<1> id) {
61+
size_t i = id[0];
62+
size_t i0 = i / n;
63+
size_t i1 = i - i0 * n;
64+
if (i0 > i1) {
65+
Amat[i] = Amat[i1 * n + i0];
66+
}
67+
});
68+
});
69+
70+
q.wait();
71+
72+
constexpr int reps = 6;
73+
74+
std::vector<double> time;
75+
std::vector<int> conv_iters;
76+
77+
time.reserve(reps);
78+
conv_iters.reserve(reps);
79+
for (int i = 0; i < reps; ++i) {
80+
auto start = std::chrono::high_resolution_clock::now();
81+
int conv_iter_count = cg_solver::cg_solve(q, n, Amat, bvec, sol_vec);
82+
auto end = std::chrono::high_resolution_clock::now();
83+
84+
time.push_back(
85+
std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
86+
.count() *
87+
1e-06);
88+
89+
conv_iters.push_back(conv_iter_count);
90+
}
91+
92+
std::cout << "Converged in : ";
93+
for (auto &el : conv_iters) {
94+
std::cout << el << " , ";
95+
}
96+
std::cout << std::endl;
97+
98+
std::cout << "Wall-clock cg_solve execution times: ";
99+
for (auto &el : time) {
100+
std::cout << el << " , ";
101+
}
102+
std::cout << std::endl;
103+
104+
T *Ax = sycl::malloc_device<T>(2 * n + 1, q);
105+
T *delta = Ax + n;
106+
107+
sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
108+
q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, sol_vec, 1, T(0), Ax,
109+
1);
110+
111+
sycl::event sub_ev = oneapi::mkl::vm::sub(q, n, Ax, bvec, delta, {gemv_ev},
112+
oneapi::mkl::vm::mode::ha);
113+
114+
T *n2 = delta + n;
115+
sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
116+
q, n, delta, 1, delta, 1, n2, {sub_ev});
117+
118+
T n2_host{};
119+
q.copy<T>(n2, &n2_host, 1, {dot_ev}).wait_and_throw();
120+
121+
std::cout << "Redisual norm squared: " << n2_host << std::endl;
122+
123+
q.wait_and_throw();
124+
sycl::free(Ax, q);
125+
sycl::free(buf, q);
126+
127+
return 0;
128+
}

examples/pybind11/onemkl_gemv/solve.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -154,36 +154,37 @@ def cg_solve(A, b):
154154
exec_queue, p, Ap, depends=[e_dot]
155155
)
156156
# x = x + alpha * p
157-
he1_axpby, e1_axpby = sycl_gemm.axpby_inplace(
157+
he1_x_update, e1_x_update = sycl_gemm.axpby_inplace(
158158
exec_queue, alpha, p, 1, x, depends=[e_p, e_x]
159159
)
160-
all_host_tasks.append(he1_axpby)
161-
e_x = e1_axpby
160+
all_host_tasks.append(he1_x_update)
161+
e_x = e1_x_update
162162

163163
# r = r - alpha * Ap
164-
he2_axpby, e2_axpby = sycl_gemm.axpby_inplace(
164+
he2_r_update, e2_r_update = sycl_gemm.axpby_inplace(
165165
exec_queue, -alpha, Ap, 1, r, depends=[e_p]
166166
)
167-
all_host_tasks.append(he2_axpby)
167+
all_host_tasks.append(he2_r_update)
168168

169169
# rsnew = dot(r, r)
170170
rsnew = sycl_gemm.norm_squared_blocking(
171-
exec_queue, r, depends=[e2_axpby]
171+
exec_queue, r, depends=[e2_r_update]
172172
)
173173
if rsnew < 1e-20:
174-
e1_axpby.wait()
174+
e1_x_update.wait()
175175
converged = i
176176
break
177177
beta = rsnew / rsold
178178

179179
# p = r + beta * p
180-
he3_axpby, e3_axpby = sycl_gemm.axpby_inplace(
181-
exec_queue, 1, r, beta, p, depends=[e1_axpby, e2_axpby]
180+
he3_p_update, e3_p_update = sycl_gemm.axpby_inplace(
181+
exec_queue, 1, r, beta, p, depends=[e2_r_update]
182182
)
183183

184184
rsold = rsnew
185-
all_host_tasks.append(he3_axpby)
186-
e_p = e3_axpby
185+
all_host_tasks.append(he3_p_update)
186+
e_p = e3_p_update
187+
e_x = e1_x_update
187188

188189
dpctl.SyclEvent.wait_for(all_host_tasks)
189190
return x, converged
@@ -229,6 +230,7 @@ def cg_solve_numpy(A, b):
229230
lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2))))
230231

231232
q = dpctl.SyclQueue(property="enable_profiling")
233+
q.print_device_info()
232234
A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q)
233235
dev = A.device
234236
b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev)

0 commit comments

Comments
 (0)