IntelPython
diff --git a/‎examples/pybind11/onemkl_gemv/CMakeLists.txt
Lines changed: 15 additions & 1 deletion b/‎examples/pybind11/onemkl_gemv/CMakeLists.txt
Lines changed: 15 additions & 1 deletion
diff --git a/‎examples/pybind11/onemkl_gemv/README.md
Lines changed: 21 additions & 2 deletions b/‎examples/pybind11/onemkl_gemv/README.md
Lines changed: 21 additions & 2 deletions
diff --git a/‎examples/pybind11/onemkl_gemv/cpp/main.cpp
Lines changed: 128 additions & 0 deletions b/‎examples/pybind11/onemkl_gemv/cpp/main.cpp
Lines changed: 128 additions & 0 deletions
diff --git a/‎examples/pybind11/onemkl_gemv/solve.py
Lines changed: 13 additions & 11 deletions b/‎examples/pybind11/onemkl_gemv/solve.py
Lines changed: 13 additions & 11 deletions
@@ -38,7 +38,7 @@ pybind11_add_module(${py_module_name}
     sycl_gemm/_onemkl.cpp
 )
 target_include_directories(${py_module_name}
-    PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR}
+    PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
 )
 target_link_libraries(${py_module_name}
     PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
@@ -53,4 +53,18 @@ set_source_files_properties(${_sycl_gemm_sources}
   COMPILE_OPTIONS "-O3;-Wno-deprecated-declarations"
 )
 
+# Stand-alone C++ executable built from cpp/main.cpp.
+# EXCLUDE_FROM_ALL keeps it out of the default build; request it explicitly
+# with `cmake --build <build-dir> --target standalone_cpp`.
+add_executable(standalone_cpp EXCLUDE_FROM_ALL cpp/main.cpp)
+
+# Optimization / warning flags for the stand-alone executable.
+target_compile_options(standalone_cpp
+    PRIVATE
+        -O3
+        -Wno-deprecated-declarations
+)
+
+# Header search path: oneMKL, TBB and the local sycl_gemm directory.
+target_include_directories(standalone_cpp
+    PUBLIC
+        ${MKL_INCLUDE_DIR}
+        ${TBB_INCLUDE_DIR}
+        sycl_gemm
+)
+
+# oneMKL (SYCL interface, ILP64, TBB threading) and TBB libraries.
+target_link_libraries(standalone_cpp
+    PRIVATE
+        ${mkl_sycl}
+        ${mkl_intel_ilp64}
+        ${mkl_tbb_thread}
+        ${mkl_core}
+        ${tbb}
+)
+
 set(ignoreMe "${SKBUILD}")
@@ -1,13 +1,32 @@
 Example of SYCL built pybind11 extension
 
-To build, use (assumes scikit-build and dpcpp) is installed
+To build, use (assumes scikit-build and dpcpp are installed):
 
 ```sh
-python setup.py develop -- -G "Ninja" -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
+python setup.py develop -- -G "Ninja" \
+     -DCMAKE_C_COMPILER:PATH=icx \
+     -DCMAKE_CXX_COMPILER:PATH=icpx \
+     -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib \
+     -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib \
+     -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include \
+     -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
 ```
 
 To run test suite
 
 ```sh
 python -m pytest tests
 ```
+
+To measure the Python overhead, compare against the stand-alone C++ executable:
+
+```
+# build stand-alone executable
+cmake --build $(find . -name cmake-build) --target standalone_cpp
+# execute it
+$(find . -name cmake-build)/standalone_cpp 1000 11
+# launch Python computation
+python sycl_timing_solver.py 1000 11
+```
+
+Compare the host times with the C++ wall-clock times, making sure that the number of iterations is the same in both runs.
@@ -0,0 +1,128 @@
+#include "cg_solver.hpp"
+#include <CL/sycl.hpp>
+#include <chrono>
+#include <exception>
+#include <iostream>
+#include <oneapi/mkl.hpp>
+#include <string>
+#include <vector>
+
+using T = double;
+
+// Parses a non-negative decimal integer from a command-line argument.
+// Returns false, leaving `value` untouched, when the text is not a number,
+// is negative, carries trailing characters, or is out of range.
+static bool parse_size(const char *text, size_t &value)
+{
+    try {
+        std::size_t consumed = 0;
+        const long long parsed = std::stoll(text, &consumed);
+        if (parsed < 0 || text[consumed] != '\0') {
+            return false;
+        }
+        value = static_cast<size_t>(parsed);
+        return true;
+    } catch (const std::exception &) {
+        // std::stoll reports malformed / out-of-range input by throwing
+        return false;
+    }
+}
+
+// Stand-alone timing driver for cg_solver::cg_solve.
+//
+// Usage: <exe> [n] [rank]   (defaults: n = 1000, rank = 16)
+//
+// Builds on the device the n-by-n matrix A = I + U * U^T, where U is an
+// n-by-rank matrix of Gaussian random numbers, and a random right-hand side
+// b. It then calls cg_solve several times, printing the returned iteration
+// counts and the wall-clock time of every call, and finally prints the
+// squared norm of the residual A * x - b.
+//
+// Returns 0 on success, 1 on invalid arguments or failed device allocation.
+int main(int argc, char *argv[])
+{
+    size_t n = 1000;
+    size_t rank = 16;
+
+    const bool args_ok = (argc <= 1 || parse_size(argv[1], n)) &&
+                         (argc <= 2 || parse_size(argv[2], rank)) && (n > 0);
+    if (!args_ok) {
+        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "standalone_cpp")
+                  << " [n > 0] [rank >= 0]" << std::endl;
+        return 1;
+    }
+
+    std::cout << "Solving " << n << " by " << n << " diagonal system with rank-"
+              << rank << " perturbation." << std::endl;
+
+    sycl::queue q;
+
+    // USM allocation for data needed by program; a single buffer is carved
+    // into A (n*n), U (n*rank), b (n) and the solution vector (n)
+    const size_t buf_size = n * n + rank * n + 2 * n;
+    T *buf = sycl::malloc_device<T>(buf_size, q);
+    if (buf == nullptr) {
+        std::cerr << "Failed to allocate " << buf_size
+                  << " elements of device memory" << std::endl;
+        return 1;
+    }
+    sycl::event memset_ev = q.fill<T>(buf, T(0), buf_size);
+
+    T *Amat = buf;
+    T *umat = buf + n * n;
+    T *bvec = umat + rank * n;
+    T *sol_vec = bvec + n;
+
+    // A = I: element (i, i) of the row-major matrix lives at i * (n + 1)
+    sycl::event set_diag_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on({memset_ev});
+        cgh.parallel_for({n}, [=](sycl::id<1> id) {
+            auto i = id[0];
+            Amat[i * (n + 1)] = T(1);
+        });
+    });
+
+    oneapi::mkl::rng::philox4x32x10 engine(q, 7777);
+    oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>
+        distr(0.0, 1.0);
+
+    // populate umat and bvec in one call (they are adjacent in buf)
+    sycl::event umat_rand_ev =
+        oneapi::mkl::rng::generate(distr, engine, n * rank + n, umat);
+
+    // A += U * U^T, written to the upper triangle only
+    sycl::event syrk_ev = oneapi::mkl::blas::row_major::syrk(
+        q, oneapi::mkl::uplo::U, oneapi::mkl::transpose::N, n, rank, T(1), umat,
+        rank, T(1), Amat, n, {umat_rand_ev, set_diag_ev});
+
+    // need to transpose: mirror the upper triangle into the lower one so
+    // that the full matrix is populated
+    q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(syrk_ev);
+        cgh.parallel_for({n * n}, [=](sycl::id<1> id) {
+            size_t i = id[0];
+            size_t i0 = i / n;
+            size_t i1 = i - i0 * n;
+            if (i0 > i1) {
+                Amat[i] = Amat[i1 * n + i0];
+            }
+        });
+    });
+
+    // finish all set-up work so that it is not included in the timings
+    q.wait();
+
+    constexpr int reps = 6;
+
+    std::vector<double> elapsed_ms;
+    std::vector<int> conv_iters;
+
+    elapsed_ms.reserve(reps);
+    conv_iters.reserve(reps);
+    for (int i = 0; i < reps; ++i) {
+        // steady_clock is monotonic, hence suitable for measuring intervals
+        // NOTE(review): the timing assumes cg_solve returns only after the
+        // solve has completed - confirm against cg_solver.hpp
+        const auto start = std::chrono::steady_clock::now();
+        const int conv_iter_count =
+            cg_solver::cg_solve(q, n, Amat, bvec, sol_vec);
+        const auto end = std::chrono::steady_clock::now();
+
+        // elapsed time in milliseconds
+        elapsed_ms.push_back(
+            std::chrono::duration<double, std::milli>(end - start).count());
+
+        conv_iters.push_back(conv_iter_count);
+    }
+
+    std::cout << "Converged in : ";
+    for (const auto &el : conv_iters) {
+        std::cout << el << " , ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "Wall-clock cg_solve execution times: ";
+    for (const auto &el : elapsed_ms) {
+        std::cout << el << " , ";
+    }
+    std::cout << std::endl;
+
+    // scratch space for the residual check: A*x (n), A*x - b (n), norm^2 (1)
+    T *Ax = sycl::malloc_device<T>(2 * n + 1, q);
+    if (Ax == nullptr) {
+        std::cerr << "Failed to allocate " << (2 * n + 1)
+                  << " elements of device memory" << std::endl;
+        sycl::free(buf, q);
+        return 1;
+    }
+    T *delta = Ax + n;
+
+    sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
+        q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, sol_vec, 1, T(0), Ax,
+        1);
+
+    sycl::event sub_ev = oneapi::mkl::vm::sub(q, n, Ax, bvec, delta, {gemv_ev},
+                                              oneapi::mkl::vm::mode::ha);
+
+    T *n2 = delta + n;
+    sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
+        q, n, delta, 1, delta, 1, n2, {sub_ev});
+
+    T n2_host{};
+    q.copy<T>(n2, &n2_host, 1, {dot_ev}).wait_and_throw();
+
+    std::cout << "Residual norm squared: " << n2_host << std::endl;
+
+    q.wait_and_throw();
+    sycl::free(Ax, q);
+    sycl::free(buf, q);
+
+    return 0;
+}
@@ -154,36 +154,37 @@ def cg_solve(A, b):
             exec_queue, p, Ap, depends=[e_dot]
         )
         # x = x + alpha * p
-        he1_axpby, e1_axpby = sycl_gemm.axpby_inplace(
+        he1_x_update, e1_x_update = sycl_gemm.axpby_inplace(
             exec_queue, alpha, p, 1, x, depends=[e_p, e_x]
         )
-        all_host_tasks.append(he1_axpby)
-        e_x = e1_axpby
+        all_host_tasks.append(he1_x_update)
+        e_x = e1_x_update
 
         # r = r - alpha * Ap
-        he2_axpby, e2_axpby = sycl_gemm.axpby_inplace(
+        he2_r_update, e2_r_update = sycl_gemm.axpby_inplace(
             exec_queue, -alpha, Ap, 1, r, depends=[e_p]
         )
-        all_host_tasks.append(he2_axpby)
+        all_host_tasks.append(he2_r_update)
 
         # rsnew = dot(r, r)
         rsnew = sycl_gemm.norm_squared_blocking(
-            exec_queue, r, depends=[e2_axpby]
+            exec_queue, r, depends=[e2_r_update]
         )
         if rsnew < 1e-20:
-            e1_axpby.wait()
+            e1_x_update.wait()
             converged = i
             break
         beta = rsnew / rsold
 
         # p = r + beta * p
-        he3_axpby, e3_axpby = sycl_gemm.axpby_inplace(
-            exec_queue, 1, r, beta, p, depends=[e1_axpby, e2_axpby]
+        he3_p_update, e3_p_update = sycl_gemm.axpby_inplace(
+            exec_queue, 1, r, beta, p, depends=[e2_r_update]
         )
 
         rsold = rsnew
-        all_host_tasks.append(he3_axpby)
-        e_p = e3_axpby
+        all_host_tasks.append(he3_p_update)
+        e_p = e3_p_update
+        e_x = e1_x_update
 
     dpctl.SyclEvent.wait_for(all_host_tasks)
     return x, converged
@@ -229,6 +230,7 @@ def cg_solve_numpy(A, b):
     lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2))))
 
     q = dpctl.SyclQueue(property="enable_profiling")
+    q.print_device_info()
     A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q)
     dev = A.device
     b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev)