diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 481ac16ec4f26..0a61410b07d18 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -311,6 +311,27 @@
 extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixPrefetchINTEL(
     T *Ptr, uint32_t NumRows, uint32_t NumCols, unsigned int CacheLevel,
     __spv::MatrixLayout Layout, size_t Stride);
+template <typename T, typename Tp, std::size_t R, std::size_t C,
+          __spv::MatrixUse U,
+          __spv::MatrixLayout L = __spv::MatrixLayout::RowMajor,
+          __spv::Scope::Flag S = __spv::Scope::Flag::Subgroup>
+extern __DPCPP_SYCL_EXTERNAL
+    __spv::__spirv_JointMatrixINTEL<Tp, R, C, L, S, U> *
+    __spirv_CooperativeMatrixLoadOffsetINTEL(T *Ptr, int32_t RowIndex,
+                                             int32_t ColIndex,
+                                             __spv::MatrixLayout Layout = L,
+                                             std::size_t Stride = 0,
+                                             int MemOperand = 0);
+
+template <typename T, typename Tp, std::size_t R, std::size_t C,
+          __spv::MatrixUse U,
+          __spv::MatrixLayout L = __spv::MatrixLayout::RowMajor,
+          __spv::Scope::Flag S = __spv::Scope::Flag::Subgroup>
+extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixStoreOffsetINTEL(
+    T *Ptr, int32_t RowIndex, int32_t ColIndex,
+    __spv::__spirv_JointMatrixINTEL<Tp, R, C, L, S, U> *Object,
+    __spv::MatrixLayout Layout = L, std::size_t Stride = 0, int MemOperand = 0);
+
 #ifndef __SPIRV_BUILTIN_DECLARATIONS__
 #error \
     "SPIR-V built-ins are not available. Please set -fdeclare-spirv-builtins flag."
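Note (reviewer commentary, not part of the patch): unlike the existing builtins, the Offset variants identify the tile by the (row, column) of its first element plus a stride, and the implementation linearises that pair according to the layout. A plain-C++ sketch of the address arithmetic the offset form replaces; the tile position and leading dimension below are illustrative:

```cpp
#include <cassert>
#include <cstddef>

// Element offset of a tile whose top-left element sits at (Row, Col) in a
// matrix with leading dimension Stride, for the two layouts the builtins take.
constexpr std::size_t tile_origin(std::size_t Row, std::size_t Col,
                                  std::size_t Stride, bool RowMajor) {
  return RowMajor ? Row * Stride + Col : Col * Stride + Row;
}

int main() {
  // A tile starting at row 8, column 32 of a matrix with leading dimension 64
  // used to require the caller to pre-offset the pointer by 8 * 64 + 32.
  assert(tile_origin(8, 32, 64, /*RowMajor=*/true) == 8 * 64 + 32);
  // The same tile origin when the matrix is stored column-major.
  assert(tile_origin(8, 32, 64, /*RowMajor=*/false) == 32 * 64 + 8);
  return 0;
}
```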
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
index a3749a0137e78..1e5a573ec2c3f 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
@@ -1120,6 +1120,101 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store_checked(
 }
 
 // End out-of-bounds API
 
+template <
+    typename Group, typename T, typename Tp,
+    sycl::ext::oneapi::experimental::matrix::use Use, size_t NumRows,
+    size_t NumCols, sycl::ext::oneapi::experimental::matrix::layout Layout,
+    access::address_space Space, access::decorated IsDecorated,
+    std::enable_if_t = true>
+inline __SYCL_ALWAYS_INLINE void
+joint_matrix_store(Group,
+                   const sycl::ext::oneapi::experimental::matrix::joint_matrix<
+                       Group, Tp, Use, NumRows, NumCols, Layout> &Src,
+                   size_t RowIndex, size_t ColIndex,
+                   multi_ptr<T, Space, IsDecorated> BaseDst, size_t Stride) {
+#if defined(__SYCL_DEVICE_ONLY__)
+  static_assert(Space != access::address_space::private_space,
+                "Joint Matrix doesn't support store to private memory!");
+#if defined(__NVPTX__)
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  throw exception(
+      make_error_code(errc::runtime),
+      "This version of the matrix extension is only currently supported on "
+      "intel devices");
+#else
+  // intel's impl
+  using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
+  DecorT *Ptr = sycl::detail::getDecorated<DecorT>(BaseDst);
+  __spirv_CooperativeMatrixStoreOffsetINTEL<
+      DecorT, Tp, NumRows, NumCols,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_use_traits<
+          Use>::value,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
+          Layout>::value>(
+      Ptr, RowIndex, ColIndex, Src.spvm,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
+          Layout>::value,
+      Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <
+    typename Group, typename T, typename Tp,
+    sycl::ext::oneapi::experimental::matrix::use Use, size_t NumRows,
+    size_t NumCols, sycl::ext::oneapi::experimental::matrix::layout Layout,
+    typename PropertyListT,
+    std::enable_if_t = true>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
+    Group,
+    const sycl::ext::oneapi::experimental::matrix::joint_matrix<
+        Group, Tp, Use, NumRows, NumCols, Layout> &Src,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> BaseDst,
+    size_t RowIndex, size_t ColIndex, size_t Stride) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  throw exception(
+      make_error_code(errc::runtime),
+      "This version of the matrix extension is only currently supported on "
+      "intel devices");
+#else
+  // intel's impl
+  T *Ptr = BaseDst.get();
+  __spirv_CooperativeMatrixStoreOffsetINTEL<
+      T, Tp, NumRows, NumCols,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_use_traits<
+          Use>::value,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
+          Layout>::value>(
+      Ptr, RowIndex, ColIndex, Src.spvm,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
+          Layout>::value,
+      Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
 } // namespace intel::experimental::matrix
 } // namespace ext
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
index d3d57f24c56e6..ff49f4d2ad067 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -377,6 +377,181 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
 #endif // defined(__SYCL_DEVICE_ONLY__)
 }
 
+// Begin offset load overloads
+template <
+    typename Group, typename S, typename T, size_t NumRows, size_t NumCols,
+    access::address_space Space, access::decorated IsDecorated,
+    std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value, bool> =
+        true>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
+    Group Sg,
+    joint_matrix<Group, S, use::accumulator, NumRows, NumCols,
+                 layout::dynamic> &Res,
+    multi_ptr<T, Space, IsDecorated> BaseSrc, size_t RowIndex, size_t ColIndex,
+    size_t Stride, sycl::ext::oneapi::experimental::matrix::layout Layout) {
+#if defined(__SYCL_DEVICE_ONLY__)
+  static_assert(Space != access::address_space::private_space,
+                "Joint Matrix doesn't support load from private memory!");
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  auto LoadStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::load_accumulator_cuda(
+      Res.matrix_impl, BaseSrc + LoadStride, Stride, Layout);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  auto LoadStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::load_accumulator_hip(
+      Res.matrix_impl, BaseSrc + LoadStride, Stride, Layout, Sg);
+#else
+  std::ignore = Sg;
+  using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
+  DecorT *Ptr = sycl::detail::getDecorated<DecorT>(BaseSrc);
+  Res.spvm = __spirv_CooperativeMatrixLoadOffsetINTEL<
+      DecorT, S, NumRows, NumCols,
+      spv_matrix_use_traits<use::accumulator>::value,
+      spv_matrix_layout_traits<layout::dynamic>::value>(
+      Ptr, RowIndex, ColIndex, sycl::detail::joint_matrix_layout_to_spv(Layout),
+      Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Res;
+  std::ignore = BaseSrc;
+  std::ignore = Stride;
+  std::ignore = Layout;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <
+    typename Group, typename S, typename T, use Use, size_t NumRows,
+    size_t NumCols, matrix::layout Layout, access::address_space Space,
+    access::decorated IsDecorated,
+    std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value ||
+                         (std::is_same<S, precision::tf32>::value &&
+                          std::is_same<std::remove_const_t<T>, float>::value),
+                     bool> = true>
+inline __SYCL_ALWAYS_INLINE void
+joint_matrix_load(Group Sg,
+                  joint_matrix<Group, S, Use, NumRows, NumCols, Layout> &Res,
+                  multi_ptr<T, Space, IsDecorated> BaseSrc, size_t RowIndex,
+                  size_t ColIndex, size_t Stride) {
+#if defined(__SYCL_DEVICE_ONLY__)
+  static_assert(Space != access::address_space::private_space,
+                "Joint Matrix doesn't support load from private memory!");
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  auto LoadStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::load_multiplicand_cuda(
+      Res.matrix_impl, BaseSrc + LoadStride, Stride);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  auto LoadStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::load_multiplicand_hip(
+      Res.matrix_impl, BaseSrc + LoadStride, Stride, Sg);
+#else
+  std::ignore = Sg;
+  using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
+  DecorT *Ptr = sycl::detail::getDecorated<DecorT>(BaseSrc);
+  Res.spvm = __spirv_CooperativeMatrixLoadOffsetINTEL<
+      DecorT, S, NumRows, NumCols, spv_matrix_use_traits<Use>::value,
+      spv_matrix_layout_traits<Layout>::value>(
+      Ptr, RowIndex, ColIndex, spv_matrix_layout_traits<Layout>::value, Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Res;
+  std::ignore = BaseSrc;
+  std::ignore = Stride;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <typename Group, typename S, typename T, size_t NumRows,
+          size_t NumCols, typename PropertyListT,
+          std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value,
+                           bool> = true>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
+    Group Sg,
+    joint_matrix<Group, S, use::accumulator, NumRows, NumCols,
+                 layout::dynamic> &Res,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> BaseSrc,
+    size_t RowIndex, size_t ColIndex, size_t Stride,
+    sycl::ext::oneapi::experimental::matrix::layout Layout) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_load on multi_ptr on Nvidia device.");
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_load on multi_ptr on AMD device.");
+#else
+  std::ignore = Sg;
+  T *Ptr = BaseSrc.get();
+  Res.spvm = __spirv_CooperativeMatrixLoadOffsetINTEL<
+      T, S, NumRows, NumCols, spv_matrix_use_traits<use::accumulator>::value,
+      spv_matrix_layout_traits<layout::dynamic>::value>(
+      Ptr, RowIndex, ColIndex, sycl::detail::joint_matrix_layout_to_spv(Layout),
+      Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Res;
+  std::ignore = BaseSrc;
+  std::ignore = Stride;
+  std::ignore = Layout;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <
+    typename Group, typename S, typename T, use Use, size_t NumRows,
+    size_t NumCols, matrix::layout Layout, typename PropertyListT,
+    std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value ||
+                         (std::is_same<S, precision::tf32>::value &&
+                          std::is_same<std::remove_const_t<T>, float>::value),
+                     bool> = true>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
+    Group Sg, joint_matrix<Group, S, Use, NumRows, NumCols, Layout> &Res,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> BaseSrc,
+    size_t RowIndex, size_t ColIndex, size_t Stride) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_load on multi_ptr on Nvidia device.");
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_load on multi_ptr on AMD device.");
+#else
+  std::ignore = Sg;
+  T *Ptr = BaseSrc.get();
+  Res.spvm = __spirv_CooperativeMatrixLoadOffsetINTEL<
+      T, S, NumRows, NumCols, spv_matrix_use_traits<Use>::value,
+      spv_matrix_layout_traits<Layout>::value>(
+      Ptr, RowIndex, ColIndex, spv_matrix_layout_traits<Layout>::value, Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Res;
+  std::ignore = BaseSrc;
+  std::ignore = Stride;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+// End offset load overloads
+
 template <typename Group, typename T, size_t NumRows, size_t NumCols,
           access::address_space Space, access::decorated IsDecorated>
 inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
@@ -472,6 +647,92 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
 #endif // defined(__SYCL_DEVICE_ONLY__)
 }
 
+// Begin offset store overloads
+template <typename Group, typename T, size_t NumRows, size_t NumCols,
+          access::address_space Space, access::decorated IsDecorated>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
+    Group Sg,
+    const joint_matrix<Group, T, use::accumulator, NumRows, NumCols,
+                       layout::dynamic>
+        &Src,
+    multi_ptr<T, Space, IsDecorated> BaseDst, size_t RowIndex, size_t ColIndex,
+    size_t Stride, sycl::ext::oneapi::experimental::matrix::layout Layout) {
+#if defined(__SYCL_DEVICE_ONLY__)
+  static_assert(Space != access::address_space::private_space,
+                "Joint Matrix doesn't support store to private memory!");
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  auto StoreStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                 : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::joint_matrix_store_cuda(
+      Src.matrix_impl, BaseDst + StoreStride, Stride, Layout);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  auto StoreStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                 : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::joint_matrix_store_hip(
+      Src.matrix_impl, BaseDst + StoreStride, Stride, Layout, Sg);
+#else
+  std::ignore = Sg;
+  using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
+  DecorT *Ptr = sycl::detail::getDecorated<DecorT>(BaseDst);
+  __spirv_CooperativeMatrixStoreOffsetINTEL<
+      DecorT, T, NumRows, NumCols,
+      spv_matrix_use_traits<use::accumulator>::value,
+      spv_matrix_layout_traits<layout::dynamic>::value>(
+      Ptr, RowIndex, ColIndex, Src.spvm,
+      sycl::detail::joint_matrix_layout_to_spv(Layout), Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  std::ignore = Layout;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <typename Group, typename T, size_t NumRows, size_t NumCols,
+          typename PropertyListT>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
+    Group Sg,
+    const joint_matrix<Group, T, use::accumulator, NumRows, NumCols,
+                       layout::dynamic>
+        &Src,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> BaseDst,
+    size_t RowIndex, size_t ColIndex, size_t Stride,
+    sycl::ext::oneapi::experimental::matrix::layout Layout) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_store on multi_ptr on Nvidia device.");
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_store on multi_ptr on AMD device.");
+#else
+  std::ignore = Sg;
+  T *Ptr = BaseDst.get();
+  __spirv_CooperativeMatrixStoreOffsetINTEL<
+      T, T, NumRows, NumCols, spv_matrix_use_traits<use::accumulator>::value,
+      spv_matrix_layout_traits<layout::dynamic>::value>(
+      Ptr, RowIndex, ColIndex, Src.spvm,
+      sycl::detail::joint_matrix_layout_to_spv(Layout), Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  std::ignore = Layout;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+// End offset store overloads
 
 template
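A condensed usage sketch of the new unified overloads, modelled on the bfloat16 e2e test further down in this patch. The tile sizes, the sub-group size of 16, the USM pointers and the VNNI-packed B operand are illustrative assumptions and not taken from the patch itself; only the joint_matrix_load/joint_matrix_store calls with (row, column) element offsets exercise the added API:

```cpp
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::bfloat16;

constexpr size_t TM = 8, TN = 16, TK = 16;

// C (MxN, row-major) += A (MxK, row-major) * B (KxN, VNNI-packed as K/2 x 2N).
void gemm_tiles(queue &q, const bfloat16 *A, const bfloat16 *B, float *C,
                size_t M, size_t N, size_t K) {
  q.parallel_for(
       nd_range<2>({M / TM, (N / TN) * 16}, {1, 16}),
       [=](nd_item<2> it) [[sycl::reqd_sub_group_size(16)]] {
         sub_group sg = it.get_sub_group();
         size_t tile_row = it.get_group(0) * TM; // tile origin, in elements
         size_t tile_col = it.get_group(1) * TN;

         auto pA = address_space_cast<access::address_space::global_space,
                                      access::decorated::no>(A);
         auto pB = address_space_cast<access::address_space::global_space,
                                      access::decorated::no>(B);
         auto pC = address_space_cast<access::address_space::global_space,
                                      access::decorated::no>(C);

         joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major> a;
         joint_matrix<sub_group, bfloat16, use::b, TK, TN,
                      layout::ext_intel_packed>
             b;
         joint_matrix<sub_group, float, use::accumulator, TM, TN> c;

         // Row/column element offsets instead of pre-offset pointers.
         joint_matrix_load(sg, c, pC, tile_row, tile_col, N, layout::row_major);
         for (size_t k = 0; k < K; k += TK) {
           joint_matrix_load(sg, a, pA, tile_row, k, K);
           // B is VNNI-packed: halve the row index, double column and stride.
           joint_matrix_load(sg, b, pB, k / 2, tile_col * 2, N * 2);
           joint_matrix_mad(sg, c, a, b, c);
         }
         joint_matrix_store(sg, c, pC, tile_row, tile_col, N,
                            layout::row_major);
       })
      .wait();
}
```

Each of these calls has a pointer-arithmetic equivalent using the existing overloads with a base pointer pre-offset by RowIndex * Stride + ColIndex (for row-major data), which is what the #else branches in the tests below keep exercising.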
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
index d1fbd4cbda15b..638ad25c46839 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
@@ -10,5 +10,8 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
+// RUN: %{build} -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+
 #include "common.hpp"
 #include "joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
index fc1fc1963e047..2560dfd46e2c6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
@@ -48,13 +48,29 @@ void matrix_multiply(big_matrix &C, big_matrix &A,
                        layout::ext_intel_packed>
           sub_b;
       joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
-
+#ifdef OFFSET
+      joint_matrix_load(
+          sg, sub_c, accC.template get_multi_ptr<access::decorated::no>(),
+          sg_startx * TM, sg_starty / sg_size * TN, N, layout::row_major);
+#else
       joint_matrix_load(
           sg, sub_c,
           accC.template get_multi_ptr<access::decorated::no>() +
              (sg_startx * TM) * N + sg_starty / sg_size * TN,
          N, layout::row_major);
+#endif
      for (int k = 0; k < K / TK; k += 1) { //
+#ifdef OFFSET
+        joint_matrix_load(
+            sg, sub_a,
+            accA.template get_multi_ptr<access::decorated::no>(),
+            sg_startx * TM, k * TK, K);
+        joint_matrix_load(
+            sg, sub_b,
+            accB.template get_multi_ptr<access::decorated::no>(),
+            k * TK / 2, sg_starty / sg_size * TN * 2, N * 2);
+
+#else
        joint_matrix_load(
            sg, sub_a,
            accA.template get_multi_ptr<access::decorated::no>() +
@@ -65,13 +81,20 @@ void matrix_multiply(big_matrix &C, big_matrix &A,
            accB.template get_multi_ptr<access::decorated::no>() +
                (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
            N * 2);
+#endif
        joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
      }
+#ifdef OFFSET
+      joint_matrix_store(
+          sg, sub_c, accC.template get_multi_ptr<access::decorated::no>(),
+          sg_startx * TM, sg_starty / sg_size * TN, N, layout::row_major);
+#else
      joint_matrix_store(
          sg, sub_c,
          accC.template get_multi_ptr<access::decorated::no>() +
              (sg_startx * TM) * N + sg_starty / sg_size * TN,
          N, layout::row_major);
+#endif
    }); // parallel for
  }).wait();
 }
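The OFFSET branch above passes k * TK / 2, sg_starty / sg_size * TN * 2 and a stride of N * 2 for the B operand because the test keeps B in ext_intel_packed (VNNI) form. A small host-only check of that index mapping, assuming the usual two-row VNNI packing these tests use; the helper and sizes below are illustrative and not part of the test:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Pack a K x N row-major matrix into VNNI form: element (i, j) moves to
// packed row i / 2, packed column j * 2 + i % 2, with leading dimension N * 2.
std::vector<int> vnni_pack(const std::vector<int> &B, std::size_t K,
                           std::size_t N) {
  std::vector<int> P(K * N);
  for (std::size_t i = 0; i < K; ++i)
    for (std::size_t j = 0; j < N; ++j)
      P[(i / 2) * (N * 2) + j * 2 + i % 2] = B[i * N + j];
  return P;
}

int main() {
  constexpr std::size_t K = 8, N = 4, TK = 4;
  std::vector<int> B(K * N);
  for (std::size_t i = 0; i < K * N; ++i)
    B[i] = static_cast<int>(i);
  auto P = vnni_pack(B, K, N);

  // The k-th TK x N slice of logical B starts, in packed coordinates, at
  // row k * TK / 2 and column 0 with stride N * 2 -- the indices the
  // OFFSET branch hands to joint_matrix_load.
  for (std::size_t k = 0; k < K / TK; ++k)
    assert(P[(k * TK / 2) * (N * 2)] == B[(k * TK) * N]);
  return 0;
}
```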
diff --git a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
index 230a20a62e1c7..453a04eb99d09 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
@@ -167,28 +167,43 @@ void test(queue &q) {
               sub_c;
           joint_matrix<sub_group, T2, use::accumulator, M, N> sub_d;
           auto stride_C = layout_C == layout::row_major ? Big_N : Big_M;
+#ifdef OFFSET
+
+          joint_matrix_load(
+              sg, sub_c, accC.template get_multi_ptr<access::decorated::no>(),
+              m * M, n * N, stride_C, layout_C);
+#else
           auto load_stride_C = layout_C == layout::row_major
                                    ? (m * M) * Big_N + n * N
                                    : (m * M) + n * N * Big_M;
-
           joint_matrix_load(
               sg, sub_c,
               accC.template get_multi_ptr<access::decorated::no>() +
                   load_stride_C,
               stride_C, layout_C);
-
+#endif
           auto stride_A = layout_A == layout::row_major ? Big_K : Big_M;
           auto stride_B = layout_B == layout::row_major ? Big_N : Big_K;
 
           // k = row/col id of current submatrix of BIG A/B matrices
           for (int k = 0; k < Sub_Tiles_K; k++) {
+#ifdef OFFSET
+            joint_matrix_load(
+                sg, sub_a,
+                accA.template get_multi_ptr<access::decorated::no>(), m * M,
+                k * K, stride_A);
+
+            joint_matrix_load(
+                sg, sub_b,
+                accB.template get_multi_ptr<access::decorated::no>(), k * K,
+                n * N, stride_B);
+#else
             auto load_stride_A = layout_A == layout::row_major
                                      ? (k * K) + (m * M * Big_K)
                                      : (k * K * Big_M) + (m * M);
            auto load_stride_B = layout_B == layout::row_major
                                     ? (k * K * Big_N) + (n * N)
                                     : (k * K) + (n * N * Big_K);
-
            joint_matrix_load(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
@@ -200,7 +215,7 @@ void test(queue &q) {
                accB.template get_multi_ptr<access::decorated::no>() +
                    load_stride_B,
                stride_B);
-
+#endif
            // round values to correct precision if using tf32
            if constexpr (std::is_same<T1, precision::tf32>::value) {
              auto round_lambda = [](auto &x) { x = round_to_tf32(x); };
@@ -211,11 +226,17 @@ void test(queue &q) {
            joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c);
            joint_matrix_copy(sg, sub_d, sub_c);
          }
+#ifdef OFFSET
          joint_matrix_store(
+              sg, sub_d, accD.template get_multi_ptr<access::decorated::no>(),
+              m * M, n * N, stride_C, layout_C);
+#else
+          joint_matrix_store(
              sg, sub_d,
              accD.template get_multi_ptr<access::decorated::no>() +
                  load_stride_C,
              stride_C, layout_C);
+#endif
        });
      });
   q.wait();
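In the CUDA test, the removed load_stride_A/B/C expressions were the hand-written, layout-dependent linearisation of exactly the (row, column) pairs the OFFSET branch now passes. A host-only check that the two agree for both layouts; the matrix sizes and tile indices below are arbitrary, not the test's:

```cpp
#include <cassert>
#include <cstddef>
#include <initializer_list>

constexpr std::size_t origin(std::size_t Row, std::size_t Col,
                             std::size_t Stride, bool RowMajor) {
  return RowMajor ? Row * Stride + Col : Col * Stride + Row;
}

int main() {
  constexpr std::size_t M = 16, N = 16, K = 16;
  constexpr std::size_t Big_M = 64, Big_N = 128, Big_K = 256;
  const std::size_t m = 2, n = 3, k = 5;

  for (bool row_major : {true, false}) {
    // Accumulator C: stride is Big_N (row-major) or Big_M (column-major).
    std::size_t stride_C = row_major ? Big_N : Big_M;
    std::size_t load_stride_C =
        row_major ? (m * M) * Big_N + n * N : (m * M) + n * N * Big_M;
    assert(origin(m * M, n * N, stride_C, row_major) == load_stride_C);

    // A: stride Big_K / Big_M; old offset (k*K)+(m*M*Big_K) or (k*K*Big_M)+(m*M).
    std::size_t stride_A = row_major ? Big_K : Big_M;
    std::size_t load_stride_A =
        row_major ? (k * K) + (m * M * Big_K) : (k * K * Big_M) + (m * M);
    assert(origin(m * M, k * K, stride_A, row_major) == load_stride_A);

    // B: stride Big_N / Big_K; old offset (k*K*Big_N)+(n*N) or (k*K)+(n*N*Big_K).
    std::size_t stride_B = row_major ? Big_N : Big_K;
    std::size_t load_stride_B =
        row_major ? (k * K * Big_N) + (n * N) : (k * K) + (n * N * Big_K);
    assert(origin(k * K, n * N, stride_B, row_major) == load_stride_B);
  }
  return 0;
}
```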
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
index b383c07f018a6..8e9afe414874c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_70.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
index 565bec3c69972..c0d26a9600409 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_72.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
index c872665ba0532..94680aa2523a0 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_80 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_80 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_80.