@@ -1,6 +1,6 @@
-// REQUIRES: cuda
-// RUN: %if any-device-is-cuda %{ %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_61 -o %t.out %}
-// RUN: %if cuda %{ %{run} %t.out %}
+// REQUIRES: cuda || hip || level_zero
+// RUN: %{build} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_61 %} -o %t.out
+// RUN: %{run} %t.out
 
 #include <cassert>
 #include <numeric>
@@ -18,17 +18,8 @@ constexpr size_t N = 512;
 
 int main() {
 
-  // Note that this code will largely be removed: it is temporary due to the
-  // temporary lack of multiple devices per sycl context in the Nvidia backend.
-  // A portable implementation, using a single gpu platform, should be possible
-  // once the Nvidia context issues are resolved.
-  ////////////////////////////////////////////////////////////////////////
-  std::vector<sycl::device> Devs;
-  for (const auto &plt : sycl::platform::get_platforms()) {
+  auto Devs = platform(gpu_selector_v).get_devices(info::device_type::gpu);
 
-    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
-      Devs.push_back(plt.get_devices()[0]);
-  }
   if (Devs.size() < 2) {
     std::cout << "Cannot test P2P capabilities, at least two devices are "
                  "required, exiting."
@@ -51,26 +42,26 @@ int main() {
   // Enables Devs[1] to access Devs[0] memory.
   Devs[1].ext_oneapi_enable_peer_access(Devs[0]);
 
-  std::vector<double> input(N);
+  std::vector<int> input(N);
   std::iota(input.begin(), input.end(), 0);
 
-  double h_sum = 0.;
+  int h_sum = 0.;
   for (const auto &value : input) {
     h_sum += value;
   }
 
-  double *d_sum = malloc_shared<double>(1, Queues[0]);
-  double *d_in = malloc_device<double>(N, Queues[0]);
+  int *d_sum = malloc_shared<int>(1, Queues[0]);
+  int *d_in = malloc_device<int>(N, Queues[0]);
 
-  Queues[0].memcpy(d_in, &input[0], N * sizeof(double));
+  Queues[0].memcpy(d_in, &input[0], N * sizeof(int));
   Queues[0].wait();
 
   range global_range{N};
 
   *d_sum = 0.;
   Queues[1].submit([&](handler &h) {
     h.parallel_for<class peer_atomic>(global_range, [=](id<1> i) {
-      sycl::atomic_ref<double, sycl::memory_order::relaxed,
+      sycl::atomic_ref<int, sycl::memory_order::relaxed,
                        sycl::memory_scope::system,
                        access::address_space::global_space>(*d_sum) += d_in[i];
     });
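
For readers following the change, below is a minimal, self-contained sketch of the pattern the updated test exercises: enumerate the GPU devices of one platform, enable peer access from the second device to the first, and have a kernel on the second device accumulate into USM owned by the first through a system-scope atomic_ref. The queue construction and the final host-side check are not part of the hunks above, so the `Queues` setup and the verification shown here are assumptions; the switch from double to int presumably keeps the system-scope atomic portable across the newly added backends, where 64-bit floating-point atomics are not universally available.

// Sketch only: illustrates the P2P + system-scope atomic pattern from the test
// above. Queue construction and the final check are assumptions; they are not
// shown in the diff.
#include <sycl/sycl.hpp>

#include <cassert>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  using namespace sycl;

  // All GPU devices of the platform picked by the default GPU selector.
  auto Devs = platform(gpu_selector_v).get_devices(info::device_type::gpu);
  if (Devs.size() < 2) {
    std::cout << "At least two devices are required, exiting." << std::endl;
    return 0;
  }

  // Assumption: one queue per device, as the test presumably does between the
  // hunks shown above.
  std::vector<queue> Queues;
  for (const auto &Dev : Devs)
    Queues.emplace_back(Dev);

  // Let Devs[1] access USM allocations owned by Devs[0] (oneAPI extension).
  Devs[1].ext_oneapi_enable_peer_access(Devs[0]);

  constexpr size_t N = 512;
  std::vector<int> input(N);
  std::iota(input.begin(), input.end(), 0);

  int *d_sum = malloc_shared<int>(1, Queues[0]);
  int *d_in = malloc_device<int>(N, Queues[0]);
  Queues[0].memcpy(d_in, input.data(), N * sizeof(int)).wait();

  *d_sum = 0;
  // The kernel runs on Devs[1] but accumulates into Devs[0]'s allocation, so
  // the atomic must use memory_scope::system.
  Queues[1]
      .parallel_for(range<1>{N},
                    [=](id<1> i) {
                      atomic_ref<int, memory_order::relaxed,
                                 memory_scope::system,
                                 access::address_space::global_space>(*d_sum) +=
                          d_in[i];
                    })
      .wait();

  // Host-side verification (assumed; the diff does not show the test's check).
  assert(*d_sum == std::accumulate(input.begin(), input.end(), 0));

  sycl::free(d_in, Queues[0]);
  sycl::free(d_sum, Queues[0]);
  std::cout << "done." << std::endl;
  return 0;
}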