Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,16 @@ endif()
include(FetchContent)

if (ENABLE_ONEMKL_SYCL)
  set(SPBLAS_GPU_BACKEND ON)
  find_package(MKL REQUIRED)
  target_link_libraries(spblas INTERFACE MKL::MKL_SYCL) # SYCL APIs
  # Attach the backend macro to the INTERFACE target instead of mutating the
  # global CMAKE_CXX_FLAGS, so it propagates to every consumer of spblas
  # (and only to them).
  target_compile_definitions(spblas INTERFACE SPBLAS_ENABLE_ONEMKL_SYCL)

  # NOTE(review): tracking `main` is not reproducible — consider pinning a
  # release tag or commit hash.
  FetchContent_Declare(
    sycl_thrust
    GIT_REPOSITORY https://github.com/SparseBLAS/sycl-thrust.git
    GIT_TAG main)
  FetchContent_MakeAvailable(sycl_thrust)
endif()

if (ENABLE_ARMPL)
Expand Down
1 change: 0 additions & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ if (NOT SPBLAS_GPU_BACKEND)
add_example(matrix_opt_example)
add_example(spmm_csc)
else()
find_package(rocthrust REQUIRED)
add_subdirectory(device)
endif()

Expand Down
8 changes: 7 additions & 1 deletion examples/device/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
# Registers a device (GPU-backend) example executable and links it against
# the backend-specific Thrust implementation.  Fails the configure step if
# no supported device backend is enabled.
function(add_device_example example_name)
  add_executable(${example_name} ${example_name}.cpp)

  if (ENABLE_ROCSPARSE)
    find_package(rocthrust REQUIRED)
    # HIP sources must be compiled by the HIP language toolchain.
    set_source_files_properties(${example_name}.cpp PROPERTIES LANGUAGE HIP)
    target_link_libraries(${example_name} rocthrust)
  # elseif (ENABLE_CUSPARSE)
  #   cuSPARSE linking details will go here.
  elseif (ENABLE_ONEMKL_SYCL)
    target_link_libraries(${example_name} sycl_thrust)
  else()
    message(FATAL_ERROR "Device backend not found.")
  endif()

  target_link_libraries(${example_name} spblas fmt)
endfunction()

Expand Down
3 changes: 1 addition & 2 deletions examples/device/simple_spmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,7 @@ int main(int argc, char** argv) {
std::span<value_t> y_span(d_y.data().get(), m);

// y = A * x
spblas::spmv_state_t state;
spblas::multiply(state, a, x_span, y_span);
spblas::multiply(a, x_span, y_span);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where did the state go ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be resolved now---both rocSPARSE and cuSPARSE now take an optional operation_info_t object.


thrust::copy(d_y.begin(), d_y.end(), y.begin());

Expand Down
2 changes: 2 additions & 0 deletions include/spblas/vendor/onemkl_sycl/detail/detail.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#pragma once

#include "create_matrix_handle.hpp"
#include "execution_policy.hpp"
#include "get_matrix_handle.hpp"
#include "get_queue.hpp"
52 changes: 52 additions & 0 deletions include/spblas/vendor/onemkl_sycl/detail/execution_policy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include <spblas/vendor/onemkl_sycl/detail/get_pointer_device.hpp>
#include <sycl/sycl.hpp>

namespace spblas {

namespace mkl {

// Execution policy that selects a queue automatically from the memory being
// operated on.  Stateless: each call deduces the device that owns the
// pointer it is given.
class parallel_policy {
public:
  parallel_policy() {}

  // Returns a queue on the device owning the USM allocation `ptr`.
  // Falls back to a CPU queue when the pointer is not associated with any
  // known context (see spblas::__mkl::get_pointer_queue).
  template <typename T>
  sycl::queue get_queue(T* ptr) const {
    return spblas::__mkl::get_pointer_queue(ptr);
  }

  // Returns a queue on the default-selected device, for call sites that
  // have no pointer available to deduce a device from.
  sycl::queue get_queue() const {
    return sycl::queue(sycl::default_selector_v);
  }
};

// Execution policy pinned to a caller-supplied SYCL queue: every operation
// dispatched through this policy targets that queue's device and context.
class device_policy {
public:
  device_policy(const sycl::queue& queue) : q_(queue) {}

  // Device and context backing the stored queue.
  sycl::device get_device() const {
    return q_.get_device();
  }

  sycl::context get_context() const {
    return q_.get_context();
  }

  // Mutable and read-only access to the stored queue.
  sycl::queue& get_queue() {
    return q_;
  }

  const sycl::queue& get_queue() const {
    return q_;
  }

private:
  sycl::queue q_;
};

inline parallel_policy par;
Copy link
Contributor

@spencerpatty spencerpatty May 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does it mean to have an inline declaration here ? we are defining a singleton of spblas::mkl::par but why inline, what does it do ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

inline doesn't remove external linkage — an inline variable still has external linkage, but it is exempt from the one-definition rule: every translation unit may contain a definition and the linker merges them into a single object. Without inline, if you compiled two .o files that both define par, you'd get duplicate-symbol linker errors. This allows the library to be header only.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aha, that makes sense. thanks for the explanation!


} // namespace mkl

} // namespace spblas
46 changes: 46 additions & 0 deletions include/spblas/vendor/onemkl_sycl/detail/get_pointer_device.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#pragma once

#include <sycl/sycl.hpp>
#include <vector>

namespace spblas {

namespace __mkl {

inline std::vector<sycl::context> global_contexts_;

template <typename T>
std::pair<sycl::device, sycl::context> get_pointer_device(T* ptr) {
if (global_contexts_.empty()) {
for (auto&& platform : sycl::platform::get_platforms()) {
sycl::context context(platform.get_devices());

global_contexts_.push_back(context);
}
}

for (auto&& context : global_contexts_) {
try {
sycl::device device = sycl::get_pointer_device(ptr, context);
return {device, context};
} catch (...) {
}
}

throw std::runtime_error(
"get_pointer_device: could not locate device corresponding to pointer");
}

// Returns a queue targeting the device that owns the memory referenced by
// `ptr`, as determined by SYCL's runtime pointer queries (see
// get_pointer_device above).  If the pointer is not associated with any
// known context — e.g. it is an ordinary host allocation — falls back to
// a queue on the CPU device instead of propagating the exception.
template <typename T>
sycl::queue get_pointer_queue(T* ptr) {
  try {
    auto&& [device, context] = get_pointer_device(ptr);
    return sycl::queue(context, device);
  } catch (...) {
    // get_pointer_device could not locate the pointer: use the CPU.
    return sycl::queue(sycl::cpu_selector_v);
  }
}
Comment on lines +34 to +42
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is this one for, as well ? what is T ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This uses SYCL's runtime APIs to determine which device is associated with the memory referenced by the pointer. Then it returns a queue on that device.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aha, can we put a comment to describe exactly that -- what happens if the pointer is not associated with a context ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I'll add a comment. If the pointer is not associated with any context, we return a queue associated with the CPU device (see line 40).


} // namespace __mkl

} // namespace spblas
40 changes: 40 additions & 0 deletions include/spblas/vendor/onemkl_sycl/detail/get_queue.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#pragma once

#include <spblas/vendor/onemkl_sycl/detail/execution_policy.hpp>

namespace spblas {

namespace __mkl {

// Deduces a queue for the auto-selecting parallel policy: the policy
// inspects `ptr` to find the device that owns the allocation.
template <typename T>
sycl::queue get_queue(const spblas::mkl::parallel_policy& policy, T* ptr) {
  return policy.get_queue(ptr);
}

// For a device_policy the queue is fixed at construction; `ptr` is
// accepted only for interface symmetry with the other overloads and is
// ignored.
template <typename T>
sycl::queue& get_queue(spblas::mkl::device_policy& policy, T* ptr) {
  return policy.get_queue();
}

} // namespace __mkl

} // namespace spblas

#if __has_include(<thrust/execution_policy.h>)

#include <thrust/execution_policy.h>

namespace spblas {

namespace __mkl {

// Queue deduction for a Thrust execution policy; `ptr` is ignored.
// NOTE(review): in stock Thrust, thrust::execution_policy is a class
// template (thrust::execution_policy<DerivedPolicy>), so this overload
// would not compile there — confirm that the sycl-thrust fork provides a
// non-template type (or an alias) with a get_queue() member.
template <typename T>
sycl::queue& get_queue(thrust::execution_policy& policy, T* ptr) {
  return policy.get_queue();
}

} // namespace __mkl

} // namespace spblas

#endif
17 changes: 14 additions & 3 deletions include/spblas/vendor/onemkl_sycl/spmv_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,20 @@

namespace spblas {

template <matrix A, vector X, vector Y>
template <typename ExecutionPolicy, matrix A, vector X, vector Y>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the execution policy should be optional. I will describe it more detail in the comments.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the execution policy is currently optional; for the MKL vendor backend we have overloads both with and without it.

requires((__detail::has_csr_base<A> || __detail::has_csc_base<A>) &&
__detail::has_contiguous_range_base<X> &&
__ranges::contiguous_range<Y>)
void multiply(A&& a, X&& x, Y&& y) {
void multiply(ExecutionPolicy&& policy, A&& a, X&& x, Y&& y) {
log_trace("");
auto x_base = __detail::get_ultimate_base(x);

auto alpha_optional = __detail::get_scaling_factor(a, x);
tensor_scalar_t<A> alpha = alpha_optional.value_or(1);

sycl::queue q(sycl::cpu_selector_v);
auto a_data = __detail::get_ultimate_base(a).values().data();

auto&& q = __mkl::get_queue(policy, a_data);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so do we always need to extract an array to put into get_queue() with the policy to get the queue ? I see that it is querying from the array what device/context are associated and then creating a queue from those ... under which circumstances will it get a common queue that already existed and use it ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doesn't this end up creating multiple queues, so they are not linked for submission events ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to me, ideally we have a single queue/device/context that is being used per policy, right ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the user creates their own policy with device_policy, everything will be created on the same queue. If we wanted to use the same queue with par, we could create a global hash table of queues keyed on device and context, then select the appropriate queue.

According to the SYCL spec, creating queues should be cheap, and de-allocating a queue should not force completion of associated SYCL events (although that's not always the case with Intel's runtime).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

my question is about whether this is a good thing, having separate queues each time (even if they are inorder queues)

sycl::queue q1(dev, ctxt);
sycl::queue q2(dev, ctxt);

auto ev1 = q1.parallel_for(range1, kernel1);
auto ev2 = q2.parallel_for(range2, kernel2);

do we have any ordering between ev1 and ev2 ?

what if we put ev1 into ev2 dependency ?

auto ev1 = q1.parallel_for(range1, kernel1);
auto ev2 = q2.parallel_for(range2, kernel2, {ev1}); 

with inorder or out-of-order queues, they have no relation in the first case, and theoretically they should work in the second case with ordering, but are we providing a nice way to order these things ?

I suppose in our current case where we are synchronous in each function, we have no problems, but I was thinking it might be good to think forward to asynchronous case ... creating queues on the fly shouldn't be a big deal, but can we avoid it with you table suggestion ? and what about inorder queues ... ? the benefit there is the lack of need for events, but we would still need them with queues created on the fly, right ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently everything's synchronous. I think for the asynchronous API, there are essentially two choices: a stream-based approach, or an event-based approach. If it's a stream-based approach, the user is almost certainly going to have to create the stream, and it will have a dedicated in-order queue associated with it.

If it's an event-based approach, the only requirement to create dependencies between events is that they belong to the same context. In Intel's runtime at least, we have a guarantee that they will be in the same context, so we (or the user) can create dependencies between different events. If we don't want to depend on this (or want to avoid creating queues on-the-fly), the previously mentioned hash table approach works.

I think for more advanced behavior like asynchrony it's also reasonable to expect the user to do a little more work (e.g. creating their own execution policy).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree and see that we are on the same page now

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking we were already implementing the common stream-based approach here, but we are not. We are implementing the synchronous API design. How much extra work would it be to add the stream-based approach (while still sync-ing at end of each function call) ? Might be good to just put that in place right away while there are few places that have this mechanism added to it ...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If someone else wants to take a stab at it, I think that'd be fine, but I don't think we should delay merging this until we have asynchrony working, as there are a few details (mostly related to state and memory allocation) that we'll need to figure out.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed, we can return to asynchrony later on and sort out the state/mem alloc/ user experience for asynchrony at a future time


auto a_handle = __mkl::get_matrix_handle(q, a);
auto a_transpose = __mkl::get_transpose(a);
Expand All @@ -51,4 +53,13 @@ void multiply(A&& a, X&& x, Y&& y) {
}
}

// Convenience overload of sparse matrix-vector multiply without an
// explicit execution policy.  Forwards to the policy-taking overload with
// the default auto-selecting policy (mkl::par), which deduces the SYCL
// queue from the matrix's data pointer.  Scaling factors attached via
// spblas::scaled are handled by the forwarded-to overload.
template <matrix A, vector X, vector Y>
  requires((__detail::has_csr_base<A> || __detail::has_csc_base<A>) &&
           __detail::has_contiguous_range_base<X> &&
           __ranges::contiguous_range<Y>)
void multiply(A&& a, X&& x, Y&& y) {
  multiply(mkl::par, std::forward<A>(a), std::forward<X>(x),
           std::forward<Y>(y));
}

} // namespace spblas
44 changes: 30 additions & 14 deletions test/gtest/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,23 +1,39 @@
enable_testing()

set(TEST_SOURCES)

# The host-only tests cannot be compiled as HIP sources, so they are
# skipped when building the rocSPARSE backend.
if (NOT ENABLE_ROCSPARSE)
  list(APPEND TEST_SOURCES
    spmv_test.cpp
    spmm_test.cpp
    spgemm_test.cpp
    spgemm_csr_csc.cpp
    add_test.cpp
    transpose_test.cpp
    triangular_solve_test.cpp)
endif()

if (SPBLAS_GPU_BACKEND)
  list(APPEND TEST_SOURCES
    thrust/spmv_test.cpp)

  if (ENABLE_ROCSPARSE)
    # HIP sources must be compiled by the HIP language toolchain.
    set_source_files_properties(${TEST_SOURCES} PROPERTIES LANGUAGE HIP)
  endif()
endif()

add_executable(spblas-tests ${TEST_SOURCES})

target_link_libraries(spblas-tests spblas fmt GTest::gtest_main)

if (ENABLE_ROCSPARSE)
  find_package(rocthrust REQUIRED)
  target_link_libraries(spblas-tests rocthrust)
endif()

if (ENABLE_ONEMKL_SYCL)
  target_link_libraries(spblas-tests sycl_thrust)
endif()

include(GoogleTest)
gtest_discover_tests(spblas-tests)
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

#include "../util.hpp"
#include <spblas/spblas.hpp>

Expand All @@ -9,7 +8,7 @@ using value_t = float;
using index_t = spblas::index_t;
using offset_t = spblas::offset_t;

TEST(CsrView, SpMV) {
TEST(thrust_CsrView, SpMV) {
for (auto&& [num_rows, num_cols, nnz] : util::dims) {
auto [values, rowptr, colind, shape, _] =
spblas::generate_csr<value_t, index_t, offset_t>(num_rows, num_cols,
Expand All @@ -32,8 +31,7 @@ TEST(CsrView, SpMV) {
std::span<value_t> b_span(d_b.data().get(), num_cols);
std::span<value_t> c_span(d_c.data().get(), num_rows);

spblas::spmv_state_t state;
spblas::multiply(state, a, b_span, c_span);
spblas::multiply(a, b_span, c_span);

thrust::copy(d_c.begin(), d_c.end(), c.begin());

Expand All @@ -53,7 +51,7 @@ TEST(CsrView, SpMV) {
}
}

TEST(CsrView, SpMV_Ascaled) {
TEST(thrust_CsrView, SpMV_Ascaled) {
for (auto&& [num_rows, num_cols, nnz] :
{std::tuple(1000, 100, 100), std::tuple(100, 1000, 10000),
std::tuple(40, 40, 1000)}) {
Expand All @@ -79,8 +77,7 @@ TEST(CsrView, SpMV_Ascaled) {
std::span<value_t> b_span(d_b.data().get(), num_cols);
std::span<value_t> c_span(d_c.data().get(), num_rows);

spblas::spmv_state_t state;
spblas::multiply(state, spblas::scaled(alpha, a), b_span, c_span);
spblas::multiply(spblas::scaled(alpha, a), b_span, c_span);

thrust::copy(d_c.begin(), d_c.end(), c.begin());

Expand All @@ -101,7 +98,7 @@ TEST(CsrView, SpMV_Ascaled) {
}
}

TEST(CsrView, SpMV_BScaled) {
TEST(thrust_CsrView, SpMV_BScaled) {
for (auto&& [num_rows, num_cols, nnz] :
{std::tuple(1000, 100, 100), std::tuple(100, 1000, 10000),
std::tuple(40, 40, 1000)}) {
Expand All @@ -127,8 +124,7 @@ TEST(CsrView, SpMV_BScaled) {
std::span<value_t> b_span(d_b.data().get(), num_cols);
std::span<value_t> c_span(d_c.data().get(), num_rows);

spblas::spmv_state_t state;
spblas::multiply(state, a, spblas::scaled(alpha, b_span), c_span);
spblas::multiply(a, spblas::scaled(alpha, b_span), c_span);

thrust::copy(d_c.begin(), d_c.end(), c.begin());

Expand Down