-
Notifications
You must be signed in to change notification settings - Fork 8
Build with sycl_thrust for device examples if building oneMKL.
#53
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
8e272ad
e79ee7f
a841a09
1f4679e
53adaa6
0b9af32
61ad2ac
053369b
e00ccc2
56534e8
4e7c172
16dc7d1
2adca57
e0ed0be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,8 +56,7 @@ int main(int argc, char** argv) { | |
| std::span<value_t> y_span(d_y.data().get(), m); | ||
|
|
||
| // y = A * x | ||
| spblas::spmv_state_t state; | ||
| spblas::multiply(state, a, x_span, y_span); | ||
| spblas::multiply(a, x_span, y_span); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where did the state go ?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be resolved now---both rocSPARSE and cuSPARSE now take an optional |
||
|
|
||
| thrust::copy(d_y.begin(), d_y.end(), y.begin()); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,6 @@ | ||
| #pragma once | ||
|
|
||
| #include "create_matrix_handle.hpp" | ||
| #include "execution_policy.hpp" | ||
| #include "get_matrix_handle.hpp" | ||
| #include "get_queue.hpp" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| #pragma once | ||
|
|
||
| #include <spblas/vendor/onemkl_sycl/detail/get_pointer_device.hpp> | ||
| #include <sycl/sycl.hpp> | ||
|
|
||
| namespace spblas { | ||
|
|
||
| namespace mkl { | ||
|
|
||
| class parallel_policy { | ||
| public: | ||
| parallel_policy() {} | ||
|
|
||
| template <typename T> | ||
| sycl::queue get_queue(T* ptr) const { | ||
| return spblas::__mkl::get_pointer_queue(ptr); | ||
| } | ||
|
Comment on lines
+15
to
+17
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so this fills the queue* into ptr ? or what is T used for ?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See below |
||
|
|
||
| sycl::queue get_queue() const { | ||
| return sycl::queue(sycl::default_selector_v); | ||
| } | ||
| }; | ||
|
|
||
| class device_policy { | ||
| public: | ||
| device_policy(const sycl::queue& queue) : queue_(queue) {} | ||
|
|
||
| sycl::queue& get_queue() { | ||
| return queue_; | ||
| } | ||
|
|
||
| const sycl::queue& get_queue() const { | ||
| return queue_; | ||
| } | ||
|
|
||
| sycl::device get_device() const { | ||
| return queue_.get_device(); | ||
| } | ||
|
|
||
| sycl::context get_context() const { | ||
| return queue_.get_context(); | ||
| } | ||
|
|
||
| private: | ||
| sycl::queue queue_; | ||
| }; | ||
|
|
||
| inline parallel_policy par; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what does it mean to have an inline declaration here ? we are defining a singleton of
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. aha, that makes sense. thanks for the explanation! |
||
|
|
||
| } // namespace mkl | ||
|
|
||
| } // namespace spblas | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| #pragma once | ||
|
|
||
| #include <sycl/sycl.hpp> | ||
| #include <vector> | ||
|
|
||
| namespace spblas { | ||
|
|
||
| namespace __mkl { | ||
|
|
||
| inline std::vector<sycl::context> global_contexts_; | ||
|
|
||
| template <typename T> | ||
| std::pair<sycl::device, sycl::context> get_pointer_device(T* ptr) { | ||
| if (global_contexts_.empty()) { | ||
| for (auto&& platform : sycl::platform::get_platforms()) { | ||
| sycl::context context(platform.get_devices()); | ||
|
|
||
| global_contexts_.push_back(context); | ||
| } | ||
| } | ||
|
|
||
| for (auto&& context : global_contexts_) { | ||
| try { | ||
| sycl::device device = sycl::get_pointer_device(ptr, context); | ||
| return {device, context}; | ||
| } catch (...) { | ||
| } | ||
| } | ||
|
|
||
| throw std::runtime_error( | ||
| "get_pointer_device: could not locate device corresponding to pointer"); | ||
| } | ||
|
|
||
| template <typename T> | ||
| sycl::queue get_pointer_queue(T* ptr) { | ||
| try { | ||
| auto&& [device, context] = get_pointer_device(ptr); | ||
| return sycl::queue(context, device); | ||
| } catch (...) { | ||
| return sycl::queue(sycl::cpu_selector_v); | ||
| } | ||
| } | ||
|
Comment on lines
+34
to
+42
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is this one for, as well ? what is T ?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This uses SYCL's runtime APIs to determine which device is associated with the memory referenced by the pointer. Then it returns a queue on that device.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. aha, can we put a comment to describe exactly that -- what happens if the pointer is not associated with a context ?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, I'll add a comment. If the pointer is not associated with any context, we return a queue associated with the CPU device (see line 40). |
||
|
|
||
| } // namespace __mkl | ||
|
|
||
| } // namespace spblas | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| #pragma once | ||
|
|
||
| #include <spblas/vendor/onemkl_sycl/detail/execution_policy.hpp> | ||
|
|
||
| namespace spblas { | ||
|
|
||
| namespace __mkl { | ||
|
|
||
| template <typename T> | ||
| sycl::queue get_queue(const spblas::mkl::parallel_policy& policy, T* ptr) { | ||
| return policy.get_queue(ptr); | ||
| } | ||
|
|
||
| template <typename T> | ||
| sycl::queue& get_queue(spblas::mkl::device_policy& policy, T* ptr) { | ||
| return policy.get_queue(); | ||
| } | ||
|
|
||
| } // namespace __mkl | ||
|
|
||
| } // namespace spblas | ||
|
|
||
| #if __has_include(<thrust/execution_policy.h>) | ||
|
|
||
| #include <thrust/execution_policy.h> | ||
|
|
||
| namespace spblas { | ||
|
|
||
| namespace __mkl { | ||
|
|
||
| template <typename T> | ||
| sycl::queue& get_queue(thrust::execution_policy& policy, T* ptr) { | ||
| return policy.get_queue(); | ||
| } | ||
|
|
||
| } // namespace __mkl | ||
|
|
||
| } // namespace spblas | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,18 +26,20 @@ | |
|
|
||
| namespace spblas { | ||
|
|
||
| template <matrix A, vector X, vector Y> | ||
| template <typename ExecutionPolicy, matrix A, vector X, vector Y> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the execution policy should be optional. I will describe it more detail in the comments.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the execution policy is currently optional; for the MKL vendor backend we have overloads both with and without it. |
||
| requires((__detail::has_csr_base<A> || __detail::has_csc_base<A>) && | ||
| __detail::has_contiguous_range_base<X> && | ||
| __ranges::contiguous_range<Y>) | ||
| void multiply(A&& a, X&& x, Y&& y) { | ||
| void multiply(ExecutionPolicy&& policy, A&& a, X&& x, Y&& y) { | ||
| log_trace(""); | ||
| auto x_base = __detail::get_ultimate_base(x); | ||
|
|
||
| auto alpha_optional = __detail::get_scaling_factor(a, x); | ||
| tensor_scalar_t<A> alpha = alpha_optional.value_or(1); | ||
|
|
||
| sycl::queue q(sycl::cpu_selector_v); | ||
| auto a_data = __detail::get_ultimate_base(a).values().data(); | ||
|
|
||
| auto&& q = __mkl::get_queue(policy, a_data); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so do we always need to extract an array to put into get_queue() with the policy to get the queue ? I see that it is querying from the array what device/context are associated and then creating a queue from those ... under which circumstances will it get a common queue that already existed and use it ?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't this end up creating multiple queues, so they are not linked for submission events ?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to me, ideally we have a single queue/device/context that is being used per policy, right ?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the user creates their own policy with According to the SYCL spec, creating queues should be cheap, and de-allocating a queue should not force completion of associated SYCL events (although that's not always the case with Intel's runtime).
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. my question is about whether this is a good thing, having separate queues each time (even if they are inorder queues) do we have any ordering between ev1 and ev2 ? what if we put ev1 into ev2 dependency ? with inorder or out-of-order queues, they have no relation in the first case, and theoretically they should work in the second case with ordering, but are we providing a nice way to order these things ? I suppose in our current case where we are synchronous in each function, we have no problems, but I was thinking it might be good to think forward to asynchronous case ... creating queues on the fly shouldn't be a big deal, but can we avoid it with you table suggestion ? and what about inorder queues ... ? the benefit there is the lack of need for events, but we would still need them with queues created on the fly, right ?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently everything's synchronous. I think for the asynchronous API, there are essentially two choices: a stream-based approach, or an event-based approach. If it's a stream-based approach, the user is almost certainly going to have to create the stream, and it will have a dedicated in-order queue associated with it. If it's an event-based approach, the only requirement to create dependencies between events is that they belong to the same context. In Intel's runtime at least, we have a guarantee that they will be in the same context, so we (or the user) can create dependencies between different events. If we don't want to depend on this (or want to avoid creating queues on-the-fly), the previously mentioned hash table approach works. I think for more advanced behavior like asynchrony it's also reasonable to expect the user to do a little more work (e.g. creating their own execution policy).
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree and see that we are on the same page now
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was thinking we were already implementing the common stream-based approach here, but we are not. We are implementing the synchronous API design. How much extra work would it be to add the stream-based approach (while still sync-ing at end of each function call) ? Might be good to just put that in place right away while there are few places that have this mechanism added to it ...
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If someone else wants to take a stab at it, I think that'd be fine, but I don't think we should delay merging this until we have asynchrony working, as there are a few details (mostly related to state and memory allocation) that we'll need to figure out.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. agreed, we can return to asynchrony later on and sort out the state/mem alloc/ user experience for asynchrony at a future time |
||
|
|
||
| auto a_handle = __mkl::get_matrix_handle(q, a); | ||
| auto a_transpose = __mkl::get_transpose(a); | ||
|
|
@@ -51,4 +53,13 @@ void multiply(A&& a, X&& x, Y&& y) { | |
| } | ||
| } | ||
|
|
||
| template <matrix A, vector X, vector Y> | ||
| requires((__detail::has_csr_base<A> || __detail::has_csc_base<A>) && | ||
| __detail::has_contiguous_range_base<X> && | ||
| __ranges::contiguous_range<Y>) | ||
| void multiply(A&& a, X&& x, Y&& y) { | ||
| multiply(mkl::par, std::forward<A>(a), std::forward<X>(x), | ||
| std::forward<Y>(y)); | ||
| } | ||
|
|
||
| } // namespace spblas | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,23 +1,39 @@ | ||
| enable_testing() | ||
|
|
||
| set(TEST_SOURCES) | ||
|
|
||
| if (NOT ENABLE_ROCSPARSE) | ||
| add_executable( | ||
| spblas-tests | ||
| spmv_test.cpp | ||
| spmm_test.cpp | ||
| spgemm_test.cpp | ||
| spgemm_csr_csc.cpp | ||
| add_test.cpp | ||
| transpose_test.cpp | ||
| triangular_solve_test.cpp | ||
| ) | ||
| elseif(ENABLE_ROCSPARSE) | ||
| set_source_files_properties(rocsparse/spmv_test.cpp PROPERTIES LANGUAGE HIP) | ||
| add_executable(spblas-tests | ||
| rocsparse/spmv_test.cpp) | ||
| list(APPEND TEST_SOURCES | ||
| spmv_test.cpp | ||
| spmm_test.cpp | ||
| spgemm_test.cpp | ||
| spgemm_csr_csc.cpp | ||
| add_test.cpp | ||
| transpose_test.cpp | ||
| triangular_solve_test.cpp) | ||
| endif() | ||
|
|
||
| if (SPBLAS_GPU_BACKEND) | ||
| list(APPEND TEST_SOURCES | ||
| thrust/spmv_test.cpp) | ||
|
|
||
| if (ENABLE_ROCSPARSE) | ||
| set_source_files_properties(${TEST_SOURCES} PROPERTIES LANGUAGE HIP) | ||
| endif() | ||
| endif() | ||
|
|
||
| add_executable(spblas-tests ${TEST_SOURCES}) | ||
|
|
||
| target_link_libraries(spblas-tests spblas fmt GTest::gtest_main) | ||
|
|
||
| if (ENABLE_ROCSPARSE) | ||
| find_package(rocthrust REQUIRED) | ||
|
||
| target_link_libraries(spblas-tests rocthrust) | ||
| endif() | ||
|
|
||
| if (ENABLE_ONEMKL_SYCL) | ||
| target_link_libraries(spblas-tests sycl_thrust) | ||
| endif() | ||
|
|
||
| include(GoogleTest) | ||
| gtest_discover_tests(spblas-tests) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe still keep it in the root CMakeLists?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, fixed this.