diff --git a/sycl/test-e2e/Basic/buffer/reinterpret.cpp b/sycl/test-e2e/Basic/buffer/reinterpret.cpp index 691af19ff34f8..7efa1f61030ea 100644 --- a/sycl/test-e2e/Basic/buffer/reinterpret.cpp +++ b/sycl/test-e2e/Basic/buffer/reinterpret.cpp @@ -2,6 +2,7 @@ // RUN: %{run} %t.out // // XFAIL: level_zero&&gpu +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14430 //==---------- reinterpret.cpp --- SYCL buffer reinterpret basic test ------==// // diff --git a/sycl/test-e2e/Basic/queue/queue.cpp b/sycl/test-e2e/Basic/queue/queue.cpp index bde85310cc06c..20697acfbee89 100644 --- a/sycl/test-e2e/Basic/queue/queue.cpp +++ b/sycl/test-e2e/Basic/queue/queue.cpp @@ -1,7 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// XFAIL: hip_nvidia + //==--------------- queue.cpp - SYCL queue test ----------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/Basic/queue/release.cpp b/sycl/test-e2e/Basic/queue/release.cpp index d241b742a0216..13ee5d6ee22bf 100644 --- a/sycl/test-e2e/Basic/queue/release.cpp +++ b/sycl/test-e2e/Basic/queue/release.cpp @@ -1,7 +1,5 @@ // RUN: %{build} -o %t.out // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s %if !windows %{--check-prefixes=CHECK-RELEASE%} -// -// XFAIL: hip_nvidia #include int main() { diff --git a/sycl/test-e2e/Basic/span.cpp b/sycl/test-e2e/Basic/span.cpp index 17c84359e8140..2a112ba4740d5 100644 --- a/sycl/test-e2e/Basic/span.cpp +++ b/sycl/test-e2e/Basic/span.cpp @@ -1,8 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// Fails to release USM pointer on HIP for NVIDIA -// XFAIL: hip_nvidia // REQUIRES: aspect-usm_shared_allocations #include diff --git a/sycl/test-e2e/Basic/stream/auto_flush.cpp b/sycl/test-e2e/Basic/stream/auto_flush.cpp index d25cf943a6f3a..f346eca5b72cf 100644 --- a/sycl/test-e2e/Basic/stream/auto_flush.cpp +++ b/sycl/test-e2e/Basic/stream/auto_flush.cpp @@ -1,7 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out %if !gpu || linux %{ | FileCheck %s %} -// -// XFAIL: hip_nvidia + //==-------------- copy.cpp - SYCL stream obect auto flushing test ---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp b/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp index 98f36dd106bae..ab9059ce98976 100644 --- a/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp +++ b/sycl/test-e2e/DeprecatedFeatures/queue_old_interop.cpp @@ -1,9 +1,6 @@ // RUN: %{build} -D__SYCL_INTERNAL_API -o %t.out // RUN: %{run-unfiltered-devices} %t.out -// -// hip_nvidia has problems constructing queues due to `No device of requested -// type available`. -// XFAIL: hip_nvidia + //==-------- queue_old_interop.cpp - SYCL queue OpenCL interop test --------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp b/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp index 8b32a880a6470..15677a64ea5e5 100644 --- a/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp +++ b/sycl/test-e2e/DeviceCodeSplit/split-per-kernel.cpp @@ -1,8 +1,6 @@ // RUN: %{build} -Wno-error=unused-command-line-argument -fsycl-device-code-split=per_kernel -o %t.out \ // RUN: -fsycl-dead-args-optimization // RUN: %{run} %t.out -// -// XFAIL: hip_nvidia #include #include diff --git a/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp b/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp index 4f9063c18e784..29c6102f71284 100644 --- a/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp +++ b/sycl/test-e2e/DeviceCodeSplit/split-per-source-main.cpp @@ -1,8 +1,6 @@ // RUN: %{build} -Wno-error=unused-command-line-argument -fsycl-device-code-split=per_source -I %S/Inputs -o %t.out %S/Inputs/split-per-source-second-file.cpp \ // RUN: -fsycl-dead-args-optimization // RUN: %{run} %t.out -// -// XFAIL: hip_nvidia #include "Inputs/split-per-source.h" diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index fd467f14800e8..fc76171e36fb7 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -1,5 +1,7 @@ // Fails with opencl non-cpu, enable when fixed. // XFAIL: (opencl && !cpu && !accelerator) +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641 + // RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %} // RUN: %{run} %t.out diff --git a/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp b/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp index 7356f94a69ff8..da0f3881ffc59 100644 --- a/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp +++ b/sycl/test-e2e/GroupLocalMemory/group_local_memory.cpp @@ -1,7 +1,5 @@ // RUN: %{build} -Wno-error=deprecated-declarations -o %t.out // RUN: %{run} %t.out -// -// XFAIL: hip_nvidia #include diff --git a/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp b/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp index 1091cf90b6dca..b0b3a0e8a8309 100644 --- a/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp +++ b/sycl/test-e2e/GroupLocalMemory/no_early_opt.cpp @@ -1,7 +1,5 @@ // RUN: %{build} -Wno-error=deprecated-declarations -fno-sycl-early-optimizations -o %t.out // RUN: %{run} %t.out -// -// XFAIL: hip_nvidia // The test checks that multiple calls to the same template instantiation of a // group local memory function result in separate allocations, even with device diff --git a/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp b/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp index 74b82dfc85ddc..fafd570f4bab2 100644 --- a/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp +++ b/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp @@ -1,6 +1,7 @@ // TODO: Passing/returning structures via invoke_simd() API is not implemented // in GPU driver yet. Enable the test when GPU RT supports it. // XFAIL: gpu && run-mode +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14543 // // RUN: %{build} -DIMPL_SUBGROUP -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out // RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out diff --git a/sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp b/sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp index 014027460301c..961a91b83b4f7 100644 --- a/sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp +++ b/sycl/test-e2e/InvokeSimd/Feature/invoke_simd_struct.cpp @@ -1,6 +1,7 @@ // TODO: Passing/returning structures via invoke_simd() API is not implemented // in GPU driver yet. Enable the test when GPU RT supports it. // XFAIL: gpu, run-mode +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14543 // // RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out // RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp deleted file mode 100644 index 34fe12ebb70e9..0000000000000 --- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// TODO: enable when Jira ticket resolved -// XFAIL: * -// -// Check that full compilation works: -// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple.cpp -o %t.out -// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -// -// VISALTO enable run -// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out - -/* - * This tests is the same as InvokeSimd/spec/tuple.cpp, but compiles without - * optional subgroup attribute specified and intended to check that compiler is - * able to choose subgroup size correctly. - */ diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp deleted file mode 100644 index 90c3468b4151f..0000000000000 --- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// TODO: enable when Jira ticket resolved -// XFAIL: * -// -// Check that full compilation works: -// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple_return.cpp -o %t.out -// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -// -// VISALTO enable run -// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out - -/* - * This tests is the same as InvokeSimd/spec/tuple_return.cpp, but compiles - * without optional subgroup attribute specified and intended to check that - * compiler is able to choose subgroup size correctly. - */ diff --git a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp b/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp deleted file mode 100644 index 7d2bed1426cd8..0000000000000 --- a/sycl/test-e2e/InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// TODO: enable when Jira ticket resolved -// XFAIL: * -// -// Check that full compilation works: -// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../tuple_vadd.cpp -o %t.out -// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -// -// VISALTO enable run -// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out - -/* - * This tests is the same as InvokeSimd/spec/tuple_vadd.cpp, but compiles - * without optional subgroup attribute specified and intended to check that - * compiler is able to choose subgroup size correctly. - */ diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp deleted file mode 100644 index 6c9d869b3684a..0000000000000 --- a/sycl/test-e2e/InvokeSimd/Spec/tuple.cpp +++ /dev/null @@ -1,147 +0,0 @@ -// TODO: enable when Jira ticket resolved -// XFAIL: * -// -// Check that full compilation works: -// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out -// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -// -// VISALTO enable run -// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out - -// Tests invoke_simd support in the compiler/headers -/* Test case description: - * ---------------------- - * This is a minimal test case to test invoke_simd support for tuples, - * as defined in the invoke_simd spec. - * - * This test case simply creates a scalar tuple per work-item - * which gets implicitly vectorized into a - * tuple, simd>. Then, inside the ESIMD function, - * we simply get the first tuple element (simd) and return it. - * - * This test also runs with all types of VISA link time optimizations enabled. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include - -/* Subgroup size attribute is optional - * In case it is absent compiler decides what subgroup size to use - */ -#ifdef IMPL_SUBGROUP -#define SUBGROUP_ATTR -#else -#define SUBGROUP_ATTR [[sycl::reqd_sub_group_size(VL)]] -#endif - -using namespace sycl::ext::oneapi::experimental; -namespace esimd = sycl::ext::intel::esimd; -constexpr int VL = 16; - -__attribute__((always_inline)) esimd::simd -ESIMD_CALLEE(std::tuple, esimd::simd> tup, - esimd::simd a) SYCL_ESIMD_FUNCTION { - esimd::simd float_vector = std::get<0>(tup); - esimd::simd int_vector = std::get<1>(tup); - return float_vector; -} - -[[intel::device_indirectly_callable]] SYCL_EXTERNAL - simd __regcall SIMD_CALLEE( - std::tuple, simd> tup, - simd a) SYCL_ESIMD_FUNCTION; - -using namespace sycl; - -int main(void) { - constexpr unsigned Size = 1024; - constexpr unsigned GroupSize = 4 * VL; - - auto q = queue{gpu_selector_v}; - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() - << "\n"; - auto ctxt = q.get_context(); - - float *A = - static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); - float *C = - static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); - - int *D = static_cast(malloc_shared(Size * sizeof(int), dev, ctxt)); - - for (unsigned i = 0; i < Size; ++i) { - A[i] = i; - C[i] = -1; - D[i] = 1; - } - - sycl::range<1> GlobalRange{Size}; - // Number of workitems in each workgroup. - sycl::range<1> LocalRange{GroupSize}; - - sycl::nd_range<1> Range(GlobalRange, LocalRange); - - try { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for(Range, [=](nd_item<1> ndi) SUBGROUP_ATTR { - sub_group sg = ndi.get_sub_group(); - group<1> g = ndi.get_group(); - uint32_t i = - sg.get_group_linear_id() * VL + g.get_group_linear_id() * GroupSize; - uint32_t wi_id = i + sg.get_local_id(); - - std::tuple tup(A[wi_id], D[wi_id]); - float res = invoke_simd(sg, SIMD_CALLEE, tup, A[wi_id]); - C[wi_id] = res; - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - sycl::free(A, q); - sycl::free(C, q); - sycl::free(D, q); - - std::cout << "SYCL exception caught: " << e.what() << '\n'; - return e.code().value(); - } - - int err_cnt = 0; - - for (unsigned i = 0; i < Size; ++i) { - if (A[i] != C[i]) { - if (++err_cnt < 10) { - std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] - << "\n"; - } - } - } - if (err_cnt > 0) { - std::cout << " pass rate: " - << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" - << (Size - err_cnt) << "/" << Size << ")\n"; - } - - sycl::free(A, q); - sycl::free(C, q); - sycl::free(D, q); - - std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); - return err_cnt > 0 ? 1 : 0; -} - -[[intel::device_indirectly_callable]] SYCL_EXTERNAL - simd __regcall SIMD_CALLEE( - std::tuple, simd> tup, - simd a) SYCL_ESIMD_FUNCTION { - esimd::simd res = ESIMD_CALLEE(tup, a); - return res; -} diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp deleted file mode 100644 index bdc5fe9a7e19b..0000000000000 --- a/sycl/test-e2e/InvokeSimd/Spec/tuple_return.cpp +++ /dev/null @@ -1,149 +0,0 @@ -// TODO: enable when Jira ticket resolved -// XFAIL: * -// -// Check that full compilation works: -// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out -// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -// -// VISALTO enable run -// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out - -// Tests invoke_simd support in the compiler/headers -/* Test case purpose: - * ---------------------- - * To test returning a tuple from invoke_simd. - * - * Test case description: - * ---------------------- - * This test case performs a vector add of A and B by passing A[wi_id] and - * B[wi_id] to an invoke_simd callee which simply combines these into a - * tuple and returns it. Then, the indivual values a and b are gotten - * back out of the tuple, added together, and stored in C[wi_id]. - * - * This test also runs with all types of VISA link time optimizations enabled. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include - -/* Subgroup size attribute is optional - * In case it is absent compiler decides what subgroup size to use - */ -#ifdef IMPL_SUBGROUP -#define SUBGROUP_ATTR -#else -#define SUBGROUP_ATTR [[sycl::reqd_sub_group_size(VL)]] -#endif - -using namespace sycl::ext::oneapi::experimental; -namespace esimd = sycl::ext::intel::esimd; -constexpr int VL = 16; - -__attribute__((always_inline)) -std::tuple, esimd::simd> -ESIMD_CALLEE(esimd::simd va, - esimd::simd vb) SYCL_ESIMD_FUNCTION { - std::tuple, esimd::simd> tup(va, vb); - return tup; -} - -[[intel::device_indirectly_callable]] SYCL_EXTERNAL - std::tuple, simd> __regcall SIMD_CALLEE( - simd va, simd vb) SYCL_ESIMD_FUNCTION; - -using namespace sycl; - -int main(void) { - constexpr unsigned Size = 1024; - constexpr unsigned GroupSize = 4 * VL; - - auto q = queue{gpu_selector_v}; - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() - << "\n"; - auto ctxt = q.get_context(); - - float *A = - static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); - float *B = - static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); - float *C = - static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); - - for (unsigned i = 0; i < Size; ++i) { - A[i] = B[i] = i; - C[i] = -1; - } - - sycl::range<1> GlobalRange{Size}; - // Number of workitems in each workgroup. - sycl::range<1> LocalRange{GroupSize}; - - sycl::nd_range<1> Range(GlobalRange, LocalRange); - - try { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for(Range, [=](nd_item<1> ndi) SUBGROUP_ATTR { - sub_group sg = ndi.get_sub_group(); - group<1> g = ndi.get_group(); - uint32_t i = - sg.get_group_linear_id() * VL + g.get_group_linear_id() * GroupSize; - uint32_t wi_id = i + sg.get_local_id(); - - std::tuple tup = - invoke_simd(sg, SIMD_CALLEE, A[wi_id], B[wi_id]); - float a = std::get<0>(tup); - float b = std::get<1>(tup); - float res = a + b; - C[wi_id] = res; - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - sycl::free(A, q); - sycl::free(B, q); - sycl::free(C, q); - - std::cout << "SYCL exception caught: " << e.what() << '\n'; - return e.code().value(); - } - - int err_cnt = 0; - - for (unsigned i = 0; i < Size; ++i) { - if (A[i] + B[i] != C[i]) { - if (++err_cnt < 10) { - std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] - << " + " << B[i] << "\n"; - } - } - } - if (err_cnt > 0) { - std::cout << " pass rate: " - << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" - << (Size - err_cnt) << "/" << Size << ")\n"; - } - - sycl::free(A, q); - sycl::free(B, q); - sycl::free(C, q); - - std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); - return err_cnt > 0 ? 1 : 0; -} - -[[intel::device_indirectly_callable]] SYCL_EXTERNAL - std::tuple, simd> __regcall SIMD_CALLEE( - simd va, simd vb) SYCL_ESIMD_FUNCTION { - std::tuple, esimd::simd> res = - ESIMD_CALLEE(va, vb); - return res; -} diff --git a/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp b/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp deleted file mode 100644 index 38f654ea769b3..0000000000000 --- a/sycl/test-e2e/InvokeSimd/Spec/tuple_vadd.cpp +++ /dev/null @@ -1,151 +0,0 @@ -// TODO: enable when Jira ticket resolved -// XFAIL: * -// -// Check that full compilation works: -// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out -// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -// -// VISALTO enable run -// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out - -// Tests invoke_simd support in the compiler/headers -/* Test case purpose: - * ------------------ - * To test invoke_simd support for tuples, as defined in the invoke_simd spec. - * - * Test case description: - * ---------------------- - * This test case performs a vector addition of 2 vectors of float, a and b. - * Instead of passing in 2 separate simd to the SIMD and ESIMD - * functions, we pass in a single tuple, simd> and - * then get and add these tuple elements, - * - * Implementation notes: - * -------------------- - * I've included an equivalent set of regular (non-tuple) vadd functions to - * verify that the overall logic of the program is correct. Switch the - * invoke_simd() call to see that the regular vadd works correctly. - * - * This test also runs with all types of VISA link time optimizations enabled. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include - -/* Subgroup size attribute is optional - * In case it is absent compiler decides what subgroup size to use - */ -#ifdef IMPL_SUBGROUP -#define SUBGROUP_ATTR -#else -#define SUBGROUP_ATTR [[sycl::reqd_sub_group_size(VL)]] -#endif - -using namespace sycl::ext::oneapi::experimental; -namespace esimd = sycl::ext::intel::esimd; -constexpr int VL = 16; - -__attribute__((always_inline)) esimd::simd ESIMD_CALLEE_TUPLE_VADD( - std::tuple, esimd::simd> tup) - SYCL_ESIMD_FUNCTION { - esimd::simd va = std::get<0>(tup); - esimd::simd vb = std::get<1>(tup); - return va + vb; -} - -[[intel::device_indirectly_callable]] SYCL_EXTERNAL - simd __regcall SIMD_CALLEE_TUPLE_VADD( - std::tuple, simd> tup) SYCL_ESIMD_FUNCTION; - -using namespace sycl; - -int main(void) { - constexpr unsigned Size = 1024; - constexpr unsigned GroupSize = 4 * VL; - - auto q = queue{gpu_selector_v}; - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() - << "\n"; - auto ctxt = q.get_context(); - - float *A = - static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); - float *B = - static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); - float *C = - static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); - - for (unsigned i = 0; i < Size; ++i) { - A[i] = B[i] = i; - C[i] = -1; - } - - sycl::range<1> GlobalRange{Size}; - // Number of workitems in each workgroup. - sycl::range<1> LocalRange{GroupSize}; - - sycl::nd_range<1> Range(GlobalRange, LocalRange); - - try { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for(Range, [=](nd_item<1> ndi) SUBGROUP_ATTR { - sub_group sg = ndi.get_sub_group(); - group<1> g = ndi.get_group(); - uint32_t i = - sg.get_group_linear_id() * VL + g.get_group_linear_id() * GroupSize; - uint32_t wi_id = i + sg.get_local_id(); - - std::tuple tup(A[wi_id], B[wi_id]); - float res = invoke_simd(sg, SIMD_CALLEE_TUPLE_VADD, tup); - C[wi_id] = res; - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - sycl::free(A, q); - sycl::free(B, q); - sycl::free(C, q); - - std::cout << "SYCL exception caught: " << e.what() << '\n'; - return e.code().value(); - } - - int err_cnt = 0; - - for (unsigned i = 0; i < Size; ++i) { - if (A[i] + B[i] != C[i]) { - if (++err_cnt < 10) { - std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] - << " + " << B[i] << "\n"; - } - } - } - if (err_cnt > 0) { - std::cout << " pass rate: " - << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" - << (Size - err_cnt) << "/" << Size << ")\n"; - } - - sycl::free(A, q); - sycl::free(B, q); - sycl::free(C, q); - - std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); - return err_cnt > 0 ? 1 : 0; -} - -[[intel::device_indirectly_callable]] SYCL_EXTERNAL - simd __regcall SIMD_CALLEE_TUPLE_VADD( - std::tuple, simd> tup) SYCL_ESIMD_FUNCTION { - esimd::simd res = ESIMD_CALLEE_TUPLE_VADD(tup); - return res; -} diff --git a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp index 5c2a9edc4682c..eb27a5b76c9d9 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp @@ -1,9 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Group algorithms are not supported on NVidia. -// XFAIL: hip_nvidia - // This test only checks that the method queue::parallel_for() accepting // reduction, can be properly translated into queue::submit + parallel_for(). diff --git a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp index b8ab5f0952e13..6e71cc363ad98 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_conditional.cpp @@ -1,11 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// Error message `The implementation handling -// parallel_for with reduction requires work group size not bigger than 1` on -// Nvidia. -// XFAIL: hip_nvidia - // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_nd_dw.cpp b/sycl/test-e2e/Reduction/reduction_nd_dw.cpp index 7bfab8d98cdf1..dd45a974b0950 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_dw.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_dw.cpp @@ -1,8 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// Group algorithms are not supported on Nvidia. -// XFAIL: hip_nvidia // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp index a4ebe61e84ed0..7e7b9fac72ecf 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_ext_double.cpp @@ -1,11 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// Error -// message `The implementation handling parallel_for with reduction requires -// work group size not bigger than 1` on Nvidia. - -// XFAIL: hip_nvidia // This test performs basic checks of parallel_for(nd_range, reduction, func) // used with 'double' type. diff --git a/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp b/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp index 03b9ebe7ca423..8277360d39059 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_ext_half.cpp @@ -1,11 +1,6 @@ // REQUIRES: aspect-fp16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// Error message on Nvidia: -// `The implementation handling parallel_for with reduction requires -// work group size not bigger than 1`. -// XFAIL: hip_nvidia // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp index 68f1ee3397576..474c67aaf9b47 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_queue_shortcut.cpp @@ -1,9 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Group algorithms are not supported on NVidia. -// XFAIL: hip_nvidia - // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_nd_rw.cpp b/sycl/test-e2e/Reduction/reduction_nd_rw.cpp index 901188866a41a..d329033446191 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_rw.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_rw.cpp @@ -1,8 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// `Group algorithms are not supported on host device.` on Nvidia. -// XFAIL: hip_nvidia // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp index f06aab2f517a1..1d94e6faaacef 100644 --- a/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_queue_shortcut.cpp @@ -1,9 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Group algorithms are not supported on NVidia. -// XFAIL: hip_nvidia - // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp b/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp index 2d1cae039550a..9c9f366f6a588 100644 --- a/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_usm_dw.cpp @@ -1,9 +1,5 @@ // RUN: %{build} -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %} // RUN: %{run} %t.out -// -// Error message `Group algorithms are not -// supported on host device.` on Nvidia. -// XFAIL: hip_nvidia // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp index e6e5252c57577..023e78fe5e85d 100644 --- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp +++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp @@ -1,8 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// `Group algorithms are not supported on host device.` on Nvidia. -// XFAIL: hip_nvidia // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_usm.cpp b/sycl/test-e2e/Reduction/reduction_usm.cpp index 9a27956982117..24dd84f66236f 100644 --- a/sycl/test-e2e/Reduction/reduction_usm.cpp +++ b/sycl/test-e2e/Reduction/reduction_usm.cpp @@ -1,8 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// `Group algorithms are not supported on host device.` on Nvidia. -// XFAIL: hip_nvidia // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Reduction/reduction_usm_dw.cpp b/sycl/test-e2e/Reduction/reduction_usm_dw.cpp index 5b36fcba18e56..6d00451a0701e 100644 --- a/sycl/test-e2e/Reduction/reduction_usm_dw.cpp +++ b/sycl/test-e2e/Reduction/reduction_usm_dw.cpp @@ -1,9 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// `Group algorithms are not supported on host device` on Nvidia. -// XFAIL: hip_nvidia - // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows diff --git a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp index e1309ee0edb1a..0c0e1750805f2 100644 --- a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp +++ b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp @@ -1,7 +1,5 @@ // RUN: %{build} -o %t.out // RUN: env SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s -// -// XFAIL: hip_nvidia // The tested functionality is disabled with Level Zero until it is supported by // the adapter. diff --git a/sycl/test-e2e/Scheduler/MemObjRemapping.cpp b/sycl/test-e2e/Scheduler/MemObjRemapping.cpp index f4de8a56217cd..55181dfbcc565 100644 --- a/sycl/test-e2e/Scheduler/MemObjRemapping.cpp +++ b/sycl/test-e2e/Scheduler/MemObjRemapping.cpp @@ -1,7 +1,6 @@ // RUN: %{build} -Wno-error=deprecated-declarations -o %t.out // RUN: env SYCL_HOST_UNIFIED_MEMORY=1 SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s -// -// XFAIL: hip_nvidia + #include #include #include diff --git a/sycl/test-e2e/Scheduler/MultipleDevices.cpp b/sycl/test-e2e/Scheduler/MultipleDevices.cpp index 3641e5d58b5ad..3976512e2d6e7 100644 --- a/sycl/test-e2e/Scheduler/MultipleDevices.cpp +++ b/sycl/test-e2e/Scheduler/MultipleDevices.cpp @@ -1,7 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run-unfiltered-devices} %t.out -// -// XFAIL: hip_nvidia //===- MultipleDevices.cpp - Test checking multi-device execution --------===// // diff --git a/sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp b/sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp index 880ec728f8951..fb634c832114d 100644 --- a/sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp +++ b/sycl/test-e2e/Scheduler/ReleaseResourcesTest.cpp @@ -1,7 +1,5 @@ // RUN: %{build} -Wno-error=unused-command-line-argument -fsycl-dead-args-optimization -o %t.out // RUN: env SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s %if !windows %{--check-prefix=CHECK-RELEASE%} -// -// XFAIL: hip_nvidia //==------------------- ReleaseResourcesTests.cpp --------------------------==// // diff --git a/sycl/test-e2e/Tracing/buffer_printers.cpp b/sycl/test-e2e/Tracing/buffer_printers.cpp index 4e29cbb02ff6c..d712b0009c1ae 100644 --- a/sycl/test-e2e/Tracing/buffer_printers.cpp +++ b/sycl/test-e2e/Tracing/buffer_printers.cpp @@ -1,7 +1,5 @@ // RUN: %{build} -o %t.out // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s -// -// XFAIL: hip_nvidia #include #include diff --git a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp index c867ce1f4f420..66257a935610e 100644 --- a/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp +++ b/sycl/test/e2e_test_requirements/no-xfail-without-tracker.cpp @@ -51,40 +51,21 @@ // tests to match the required format and in that case you should just update // (i.e. reduce) the number and the list below. // -// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 77 +// NUMBER-OF-XFAIL-WITHOUT-TRACKER: 41 // // List of improperly XFAIL-ed tests. // Remove the CHECK once the test has been properly XFAIL-ed. // // CHECK: AddressSanitizer/nullpointer/private_nullptr.cpp // CHECK-NEXT: Basic/aspects.cpp -// CHECK-NEXT: Basic/buffer/reinterpret.cpp // CHECK-NEXT: Basic/device_event.cpp // CHECK-NEXT: Basic/diagnostics/handler.cpp // CHECK-NEXT: Basic/max_linear_work_group_size_props.cpp // CHECK-NEXT: Basic/max_work_group_size_props.cpp // CHECK-NEXT: Basic/partition_supported.cpp -// CHECK-NEXT: Basic/queue/queue.cpp -// CHECK-NEXT: Basic/queue/release.cpp -// CHECK-NEXT: Basic/span.cpp -// CHECK-NEXT: Basic/stream/auto_flush.cpp -// CHECK-NEXT: DeprecatedFeatures/queue_old_interop.cpp -// CHECK-NEXT: DeviceCodeSplit/split-per-kernel.cpp -// CHECK-NEXT: DeviceCodeSplit/split-per-source-main.cpp // CHECK-NEXT: DeviceLib/assert-windows.cpp // CHECK-NEXT: ESIMD/hardware_dispatch.cpp -// CHECK-NEXT: GroupAlgorithm/root_group.cpp -// CHECK-NEXT: GroupLocalMemory/group_local_memory.cpp -// CHECK-NEXT: GroupLocalMemory/no_early_opt.cpp // CHECK-NEXT: InlineAsm/asm_multiple_instructions.cpp -// CHECK-NEXT: InvokeSimd/Feature/ImplicitSubgroup/invoke_simd_struct.cpp -// CHECK-NEXT: InvokeSimd/Feature/invoke_simd_struct.cpp -// CHECK-NEXT: InvokeSimd/Spec/ImplicitSubgroup/tuple.cpp -// CHECK-NEXT: InvokeSimd/Spec/ImplicitSubgroup/tuple_return.cpp -// CHECK-NEXT: InvokeSimd/Spec/ImplicitSubgroup/tuple_vadd.cpp -// CHECK-NEXT: InvokeSimd/Spec/tuple.cpp -// CHECK-NEXT: InvokeSimd/Spec/tuple_return.cpp -// CHECK-NEXT: InvokeSimd/Spec/tuple_vadd.cpp // CHECK-NEXT: KernelAndProgram/kernel-bundle-merge-options.cpp // CHECK-NEXT: Matrix/SG32/joint_matrix_annotated_ptr.cpp // CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -108,28 +89,11 @@ // CHECK-NEXT: Printf/mixed-address-space.cpp // CHECK-NEXT: Printf/percent-symbol.cpp // CHECK-NEXT: Reduction/reduction_big_data.cpp -// CHECK-NEXT: Reduction/reduction_nd_N_queue_shortcut.cpp -// CHECK-NEXT: Reduction/reduction_nd_conditional.cpp -// CHECK-NEXT: Reduction/reduction_nd_dw.cpp -// CHECK-NEXT: Reduction/reduction_nd_ext_double.cpp -// CHECK-NEXT: Reduction/reduction_nd_ext_half.cpp -// CHECK-NEXT: Reduction/reduction_nd_queue_shortcut.cpp // CHECK-NEXT: Reduction/reduction_nd_reducer_skip.cpp -// CHECK-NEXT: Reduction/reduction_nd_rw.cpp -// CHECK-NEXT: Reduction/reduction_range_queue_shortcut.cpp -// CHECK-NEXT: Reduction/reduction_range_usm_dw.cpp // CHECK-NEXT: Reduction/reduction_reducer_op_eq.cpp -// CHECK-NEXT: Reduction/reduction_span_pack.cpp -// CHECK-NEXT: Reduction/reduction_usm.cpp -// CHECK-NEXT: Reduction/reduction_usm_dw.cpp // CHECK-NEXT: Regression/build_log.cpp // CHECK-NEXT: Regression/complex_global_object.cpp // CHECK-NEXT: Regression/context_is_destroyed_after_exception.cpp // CHECK-NEXT: Regression/kernel_bundle_ignore_sycl_external.cpp // CHECK-NEXT: Regression/multiple-targets.cpp // CHECK-NEXT: Regression/reduction_resource_leak_dw.cpp -// CHECK-NEXT: Scheduler/InOrderQueueDeps.cpp -// CHECK-NEXT: Scheduler/MemObjRemapping.cpp -// CHECK-NEXT: Scheduler/MultipleDevices.cpp -// CHECK-NEXT: Scheduler/ReleaseResourcesTest.cpp -// CHECK-NEXT: Tracing/buffer_printers.cpp