diff --git a/sycl/include/sycl/__spirv/spirv_ops.hpp b/sycl/include/sycl/__spirv/spirv_ops.hpp
index b001bc914b196..ad0c7a31d3519 100644
--- a/sycl/include/sycl/__spirv/spirv_ops.hpp
+++ b/sycl/include/sycl/__spirv/spirv_ops.hpp
@@ -84,15 +84,15 @@ extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixStoreCheckedINTEL(
     std::size_t Stride, size_t Height, size_t Width, size_t CoordX,
     size_t CoordY, __spv::MatrixLayout Layout = L, int MemOperand = 0);
-template
 extern __DPCPP_SYCL_EXTERNAL
-    __spv::__spirv_CooperativeMatrixKHR *
+    __spv::__spirv_CooperativeMatrixKHR *
     __spirv_CooperativeMatrixMulAddKHR(
         __spv::__spirv_CooperativeMatrixKHR *A,
         __spv::__spirv_CooperativeMatrixKHR *B,
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
index b8484a077c5fc..379ebaf1a1063 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
@@ -263,6 +263,25 @@ class wi_element::value,
+                            spv_scope_traits::value>(&M.spvm, idx);
+    union {
+      uint16_t intStorage;
+      sycl::ext::oneapi::bfloat16 floatValue;
+    };
+    floatValue = *ExtractP;
+    return __spirv_ConvertBF16ToFINTEL(intStorage);
+#else
+    throw exception(make_error_code(errc::runtime),
+                    "joint matrix is not supported on host.");
+#endif // __SYCL_DEVICE_ONLY__
+  }
+
   explicit operator bool() {
 #ifdef __SYCL_DEVICE_ONLY__
     sycl::ext::oneapi::bfloat16 *ExtractP =
@@ -295,6 +314,21 @@ class wi_element::value,
+                           spv_scope_traits::value>(&M.spvm, idx);
+    *InsertP = rhs;
+    return *this;
+#else
+    (void)rhs;
+    throw exception(make_error_code(errc::runtime),
+                    "joint matrix is not supported on host.");
+#endif // __SYCL_DEVICE_ONLY__
+  }
+
   wi_element &operator=(const wi_element &rhs) {
 #ifdef __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp
index 349acae157ae7..0e2e72b41e929 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp
@@ -85,26 +85,26 @@ extern "C" constexpr __spv::MatrixLayout joint_matrix_layout_to_spv(
   }
 }
 
-template
+template
 constexpr uint32_t CalculateMatrixOperand() {
+  uint32_t returnValue = 0x00;
   if constexpr (std::is_same::value &&
-                std::is_same::value &&
-                std::is_same::value)
-    return static_cast(
+                std::is_same::value)
+    returnValue += static_cast(
         __spv::MatrixOperands::MatrixAAndBBFloat16ComponentsINTEL);
-  if constexpr (std::is_signed::value && std::is_unsigned::value)
-    return static_cast(
+  if constexpr (std::is_same::value)
+    returnValue += static_cast(
+        __spv::MatrixOperands::MatrixCBFloat16ComponentsINTEL);
+  if constexpr (std::is_same::value)
+    returnValue += static_cast(
+        __spv::MatrixOperands::MatrixResultBFloat16ComponentsINTEL);
+  if constexpr (std::is_signed::value)
+    returnValue += static_cast(
         __spv::MatrixOperands::MatrixASignedComponentsKHR);
-  if constexpr (std::is_unsigned::value && std::is_signed::value)
-    return static_cast(
+  if constexpr (std::is_signed::value)
+    returnValue += static_cast(
         __spv::MatrixOperands::MatrixBSignedComponentsKHR);
-  if constexpr (std::is_signed::value && std::is_signed::value) {
-    return static_cast(
-        __spv::MatrixOperands::MatrixASignedComponentsKHR) +
-        static_cast(
-            __spv::MatrixOperands::MatrixBSignedComponentsKHR);
-  }
-  return 0;
+  return returnValue;
 }
 } // namespace detail
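Note on the CalculateMatrixOperand hunk above: the function used to return from one of several mutually exclusive branches; it now accumulates independent bit flags, which is what lets the new bfloat16 C/Result properties combine with the A/B signedness bits in a single operand word. A minimal standalone sketch of that logic follows (C++17; the enumerator values are assumptions for illustration, not the values from the SPIR-V KHR/INTEL cooperative-matrix specs):

// Standalone sketch. Enumerator values are ASSUMED for illustration.
#include <cstdint>
#include <type_traits>

enum class MatrixOperands : uint32_t {
  MatrixASignedComponentsKHR = 0x1,           // assumed value
  MatrixBSignedComponentsKHR = 0x2,           // assumed value
  MatrixAAndBBFloat16ComponentsINTEL = 0x40,  // assumed value
  MatrixCBFloat16ComponentsINTEL = 0x80,      // assumed value
  MatrixResultBFloat16ComponentsINTEL = 0x100 // assumed value
};

struct bf16_tag {}; // stand-in for sycl::ext::oneapi::bfloat16

// Each property owns a distinct bit, so "+" behaves like "|" and several
// properties can be encoded in one operand word.
template <typename Ta, typename Tb, typename Tc, typename Td>
constexpr uint32_t calculate_operand() {
  uint32_t v = 0;
  if constexpr (std::is_same_v<Ta, bf16_tag> && std::is_same_v<Tb, bf16_tag>)
    v += static_cast<uint32_t>(
        MatrixOperands::MatrixAAndBBFloat16ComponentsINTEL);
  if constexpr (std::is_same_v<Tc, bf16_tag>)
    v += static_cast<uint32_t>(MatrixOperands::MatrixCBFloat16ComponentsINTEL);
  if constexpr (std::is_same_v<Td, bf16_tag>)
    v += static_cast<uint32_t>(
        MatrixOperands::MatrixResultBFloat16ComponentsINTEL);
  if constexpr (std::is_signed_v<Ta>)
    v += static_cast<uint32_t>(MatrixOperands::MatrixASignedComponentsKHR);
  if constexpr (std::is_signed_v<Tb>)
    v += static_cast<uint32_t>(MatrixOperands::MatrixBSignedComponentsKHR);
  return v;
}

// bf16 A/B with float C/D -> only the A-and-B flag; int8 A/B -> both sign bits.
static_assert(calculate_operand<bf16_tag, bf16_tag, float, float>() == 0x40);
static_assert(calculate_operand<signed char, signed char, int, int>() == 0x3);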
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
index 3c1c8e3a84597..7aaac6c84bfe6 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -431,8 +431,7 @@ template
     (), sycl::detail::convertTypeToMatrixTypeString(), M, K, N)]]
 #endif // defined(__SYCL_DEVICE_ONLY__)
-inline __SYCL_ALWAYS_INLINE void
-joint_matrix_mad(
+inline __SYCL_ALWAYS_INLINE void joint_matrix_mad(
     Group,
     joint_matrix &D,
@@ -462,9 +461,9 @@ joint_matrix_mad(
   }
 #else
   constexpr uint32_t MatrixOperand =
-      sycl::detail::CalculateMatrixOperand();
-  D.spvm =
-      __spirv_CooperativeMatrixMulAddKHR(A.spvm, B.spvm, C.spvm, MatrixOperand);
+      sycl::detail::CalculateMatrixOperand();
+  D.spvm = __spirv_CooperativeMatrixMulAddKHR(
+      A.spvm, B.spvm, C.spvm, MatrixOperand);
 #endif // defined(__NVPTX__)
 #else
   std::ignore = A;
@@ -489,10 +488,23 @@ void joint_matrix_copy(
   using storage_element_type =
       typename oneapi::detail::jm_type_interpretation_helper_trait<
           T2>::storage_element_type;
+  using src_storage_element_type =
+      typename oneapi::detail::jm_type_interpretation_helper_trait<
+          T1>::storage_element_type;
+
   auto wi_data_c = sycl::ext::oneapi::detail::get_wi_data(sg, src);
   auto wi_data_dst = sycl::ext::oneapi::detail::get_wi_data(sg, dst);
   for (int i = 0; i < wi_data_c.length(); i++) {
-    wi_data_dst[i] = static_cast(wi_data_c[i]);
+    if constexpr (std::is_same_v) {
+      // Special case for SRC type sycl::half, since we can't cast directly
+      // from wi_element (typed half) to another type.
+      // The first cast is from wi_element to half (T1);
+      // the second cast is from half to the dst type (T2).
+      wi_data_dst[i] = static_cast(
+          static_cast(wi_data_c[i]));
+    } else {
+      wi_data_dst[i] = static_cast(wi_data_c[i]);
+    }
   }
 #endif // defined(__NVPTX__)
 #else
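Note on the joint_matrix_copy hunk above: it works around C++'s one-user-defined-conversion rule. The wi_element proxy converts to its own element type, so a single static_cast from a half-typed proxy to an unrelated 16-bit type would need two user-defined conversions in one sequence, which is ill-formed. A self-contained toy model (Half, BF16, and wi_element_proxy are stand-ins, not the real SYCL types; the float hop mirrors bfloat16 constructing from float):

#include <cstdio>

struct Half {
  float v;
  explicit operator float() const { return v; }
};

struct BF16 {
  float v;
  explicit BF16(float f) : v(f) {}
};

// Mimics the wi_element accessor returned by get_wi_data()[i]: the only
// conversion it offers is to its element type.
struct wi_element_proxy {
  Half h;
  operator Half() const { return h; }
};

int main() {
  wi_element_proxy src{Half{1.5f}};
  // BF16 bad = static_cast<BF16>(src); // ill-formed: needs two user-defined
  //                                    // conversions (proxy->Half->BF16)
  BF16 dst = static_cast<BF16>(static_cast<float>(static_cast<Half>(src)));
  std::printf("%f\n", dst.v);
  return 0;
}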
diff --git a/sycl/test-e2e/Matrix/Inputs/common.hpp b/sycl/test-e2e/Matrix/Inputs/common.hpp
index 73def8ead8bba..f87cbfb992505 100644
--- a/sycl/test-e2e/Matrix/Inputs/common.hpp
+++ b/sycl/test-e2e/Matrix/Inputs/common.hpp
@@ -67,7 +67,7 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
     for (unsigned int n = 0; n < N; n++) {
       int c_ind = transpose_c ? (n * M + m) : m * N + n;
       Tc acc = *(C + c_ind);
-
+      float tmp = 0.f;
       for (unsigned int k = 0; k < K; k++) {
         int a_ind = colmajor_a ? (k * M + m) : m * K + k;
         int b_ind = colmajor_b ? (n * K + k) : k * N + n;
@@ -80,6 +80,9 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
           acc += make_fp32(va[i]) * make_fp32(vb[i]);
         else if constexpr (std::is_same_v)
           acc += (float)va[i] * (float)vb[i];
+        else if constexpr (std::is_same_v &&
+                           std::is_same_v)
+          tmp += (float)va[i] * (float)vb[i];
         else if constexpr (std::is_same_v &&
                                std::is_same_v ||
                            std::is_integral_v && std::is_integral_v ||
@@ -92,6 +95,9 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
           assert(false && "Unsupported type in matrix_multiply_ref.");
       }
     }
+    if constexpr (std::is_same_v &&
+                  std::is_same_v)
+      acc += (bfloat16)tmp;
 
     if constexpr (!std::is_same_v) {
       lambda(acc);
@@ -182,10 +188,11 @@ template
 bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) {
   for (int i = 0; i < rows; i++) {
     for (int j = 0; j < cols; j++) {
-      if constexpr (!exact && (std::is_same_v ||
-                               std::is_same_v ||
-                               (std::is_same_v &&
-                                std::is_same_v))) {
+      if constexpr (!exact &&
+                    (std::is_same_v ||
+                     std::is_same_v || std::is_same_v ||
+                     (std::is_same_v &&
+                      std::is_same_v))) {
         float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]);
         if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) {
           std::cerr << "Incorrect result in matrix. "
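Note on the common.hpp hunks above: the reference path now mirrors what the hardware MAD does for a bfloat16 accumulator — products are summed in a float "tmp" and converted to bfloat16 once at the end, instead of rounding to bfloat16 on every addition. A small sketch of why that matters (round_to_bf16 truncates to the top 16 bits; real bfloat16 uses round-to-nearest-even, but the drift is the same in kind):

#include <cstdint>
#include <cstdio>
#include <cstring>

static float round_to_bf16(float x) {
  uint32_t u;
  std::memcpy(&u, &x, 4);
  u &= 0xFFFF0000u; // bfloat16 keeps only the top 16 bits of a float
  std::memcpy(&x, &u, 4);
  return x;
}

int main() {
  float per_step = 1.0f; // rounds to bf16 after every add (the naive way)
  float tmp = 0.0f;      // full-precision float accumulator (the diff's way)
  for (int k = 0; k < 64; ++k) {
    float prod = 0.01f * 3.0f; // stand-in for one bf16*bf16 product
    per_step = round_to_bf16(per_step + prod);
    tmp += prod;
  }
  float once = round_to_bf16(1.0f + round_to_bf16(tmp));
  std::printf("per-step rounding: %f, single conversion: %f\n", per_step, once);
  return 0;
}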
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_16bit_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_16bit_impl.hpp
new file mode 100644
index 0000000000000..fdfffd5aa06b9
--- /dev/null
+++ b/sycl/test-e2e/Matrix/Inputs/joint_matrix_16bit_impl.hpp
@@ -0,0 +1,138 @@
+//===---joint_matrix_16bit_impl.hpp - DPC++ joint_matrix----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+template
+class imatrix;
+
+template
+void matrix_multiply(big_matrix &D, big_matrix &C,
+                     big_matrix &A,
+                     big_matrix &B) {
+  size_t NDRangeM = M / TM;
+  size_t NDRangeN = N / TN;
+  buffer bufA(A.get_data(), range<2>(M, K));
+  buffer bufB(B.get_data(), range<2>(K, N));
+  buffer bufC((TAcc *)C.get_data(), range<2>(M, N));
+  buffer bufD((TResult *)D.get_data(), range<2>(M, N));
+  queue q;
+  size_t sg_size =
+      get_sg_size>(q);
+
+  q.submit([&](handler &cgh) {
+     accessor accA{bufA, cgh};
+     accessor accB{bufB, cgh};
+     accessor accC{bufC, cgh};
+     accessor accD{bufD, cgh};
+
+     cgh.parallel_for>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[sycl::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           // The submatrix API has to be accessed by all the work-items in a
+           // subgroup; these functions will be called once by the subgroup,
+           // with no code divergence between the work-items.
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix
+               sub_a;
+           joint_matrix sub_b;
+           joint_matrix sub_c;
+           joint_matrix sub_d;
+
+           joint_matrix_load(
+               sg, sub_c,
+               accC.template get_multi_ptr() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+
+           for (int k = 0; k < K / TK; k += 1) {
+             joint_matrix_load(
+                 sg, sub_a,
+                 accA.template get_multi_ptr() +
+                     (sg_startx * TM) * K + k * TK,
+                 K);
+             joint_matrix_load(
+                 sg, sub_b,
+                 accB.template get_multi_ptr() +
+                     (k * TK / VF) * (N * VF) + sg_starty / sg_size * TN * VF,
+                 N * VF);
+
+             joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c);
+             joint_matrix_copy(sg, sub_d, sub_c);
+           }
+
+           joint_matrix_store(
+               sg, sub_d,
+               accD.template get_multi_ptr() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+         }); // parallel for
+   }).wait();
+}
+
+template
+void test() {
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
+
+  static constexpr size_t MATRIX_M = TM * 2;
+  static constexpr size_t MATRIX_N = TN * 2;
+  static constexpr size_t MATRIX_K = TK * 2;
+  Tab A[MATRIX_M][MATRIX_K];
+  Tab B[MATRIX_K / VF][MATRIX_N * VF];
+  TAcc C[MATRIX_M][MATRIX_N];
+  TResult D[MATRIX_M][MATRIX_N];
+  TResult DRef[MATRIX_M][MATRIX_N];
+
+  matrix_rand(MATRIX_M, MATRIX_K, (Tab *)A, Tab(1));
+  matrix_rand(MATRIX_K / VF, MATRIX_N * VF, (Tab *)B, Tab(1));
+
+  matrix_fill(MATRIX_M, MATRIX_N, (TAcc *)C, TAcc(1));
+  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1));
+  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)DRef, TResult(1));
+
+  big_matrix MC((TAcc *)&C);
+  big_matrix MD((TResult *)&D);
+  big_matrix MA((Tab *)&A);
+  big_matrix MB((Tab *)&B);
+
+  matrix_multiply(MD, MC, MA, MB);
+  matrix_multiply_ref(
+      (Tab *)A, (Tab *)B, (TResult *)DRef, MATRIX_M, MATRIX_N, MATRIX_K / VF);
+  assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)D, (TResult *)DRef));
+}
+
+template
+void test_combo() {
+  test();
+  test();
+  test();
+  test();
+}
+
+template
+void test_all() {
+  test_combo();
+  test_combo();
+  test_combo();
+  test_combo();
+  test_combo();
+  test_combo();
+}
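Note on the B-matrix load in the new header above: it assumes VNNI ("packed") layout, where a K x N matrix is stored as K/VF rows of N*VF elements with VF consecutive k-values of a column adjacent in memory — hence the (k * TK / VF) * (N * VF) base offset and the N * VF stride. A standalone sketch of the packing rule:

// VNNI-pack a row-major K x N matrix with factor VF and check that element
// (k, n) lands at row k/VF, column n*VF + k%VF of the (K/VF) x (N*VF) buffer.
#include <cassert>
#include <vector>

int main() {
  constexpr int K = 8, N = 4, VF = 2;
  std::vector<int> B(K * N), packed((K / VF) * (N * VF));
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n)
      B[k * N + n] = 100 * k + n;
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n)
      packed[(k / VF) * (N * VF) + n * VF + k % VF] = B[k * N + n];
  // Every (k, n) can be recovered from the packed layout.
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n)
      assert(packed[(k / VF) * (N * VF) + n * VF + k % VF] == B[k * N + n]);
  return 0;
}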
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_impl.hpp
deleted file mode 100644
index 00e804cef2fb5..0000000000000
--- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_impl.hpp
+++ /dev/null
@@ -1,148 +0,0 @@
-//===---joint_matrix_bfloat16_impl.hpp - DPC++ joint_matrix----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-template class imatrix;
-
-template
-void matrix_multiply(big_matrix &C, big_matrix &A,
-                     big_matrix &B) {
-  size_t NDRangeM = M / TM;
-  size_t NDRangeN = N / TN;
-  buffer bufA(A.get_data(), range<2>(M, K));
-  buffer bufB(B.get_data(), range<2>(K, N));
-  buffer bufC((T1 *)C.get_data(), range<2>(M, N));
-
-  queue q;
-  size_t sg_size = get_sg_size>(q);
-  q.submit([&](handler &cgh) {
-     accessor accA{bufA, cgh};
-     accessor accB{bufB, cgh};
-     accessor accC{bufC, cgh};
-
-     cgh.parallel_for>(
-         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-             [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup no
-           // code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix sub_a;
-           // For B, we assume B has been already VNNIed.
-           joint_matrix
-               sub_b;
-           joint_matrix sub_c;
-
-           joint_matrix_load(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-           for (int k = 0; k < K / TK; k += 1) { //
-             joint_matrix_load(
-                 sg, sub_a,
-                 accA.template get_multi_ptr() +
-                     (sg_startx * TM) * K + k * TK,
-                 K);
-             joint_matrix_load(
-                 sg, sub_b,
-                 accB.template get_multi_ptr() +
-                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
-                 N * 2);
-             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-           }
-           joint_matrix_store(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-         }); // parallel for
-   }).wait();
-}
-
-template
-void test() {
-  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
-            << " [TM x TN x TK]" << std::endl;
-
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  static constexpr size_t MATRIX_K = TK * 2;
-  T A[MATRIX_M][MATRIX_K];
-  T B[MATRIX_K / 2][MATRIX_N * 2];
-  TResult C[MATRIX_M][MATRIX_N];
-  TResult D[MATRIX_M][MATRIX_N];
-
-  matrix_fill(MATRIX_M, MATRIX_K, (T *)A,
-              [](int i, int j) { return T(1) * (i + j); });
-  matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (T *)B,
-              [](int i, int j) { return T(2) * i + T(3) * j; });
-  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1));
-  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1));
-
-  big_matrix MC((TResult *)&C);
-  big_matrix MD((TResult *)&D);
-  big_matrix MA((T *)&A);
-  big_matrix MB((T *)&B);
-  matrix_multiply(MC, MA,
-                  MB);
-  matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, MATRIX_M,
-                      MATRIX_N, MATRIX_K / 2);
-
-  assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D));
-}
-int main() {
-  queue q;
-  std::vector combinations =
-      q.get_device()
-          .get_info();
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      test();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test();
-      // test();
-
-      // This combination is not currently supported for sub group size = 32 in
-      // IGC
-#if (!defined(SG_SZ) || SG_SZ != 32)
-      test();
-      // test();
-      test();
-      // test();
-      test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-#endif
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test();
-      break;
-    }
-  }
-  return 0;
-}
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_packedB_impl.hpp
deleted file mode 100644
index 85d33f2c83173..0000000000000
--- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_packedB_impl.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-//=----- joint_matrix_bfloat16_packedB_impl.hpp - DPC++ joint_matrix -------=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//=-------------------------------------------------------------------------=//
-
-template
-void matrix_multiply(big_matrix &C, big_matrix &A,
-                     big_matrix &B) {
-  size_t NDRangeM = M / TM;
-  size_t NDRangeN = N / TN;
-  buffer bufA(A.get_data(), range<2>(M, K));
-  buffer bufB(B.get_data(), range<2>(K, N));
-  buffer bufC((float *)C.get_data(), range<2>(M, N));
-
-  queue q;
-  size_t sg_size = get_sg_size(q);
-  q.submit([&](handler &cgh) {
-     auto accC = bufC.get_access(cgh);
-     auto accA = bufA.get_access(cgh);
-     auto accB = bufB.get_access(cgh);
-
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-             [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup no
-           // code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix
-               sub_a;
-           // For B, we assume B has been already VNNIed.
-           joint_matrix
-               sub_b;
-           joint_matrix sub_c;
-
-           joint_matrix_load(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-           for (int k = 0; k < K / TK; k += 1) { //
-             joint_matrix_load(
-                 sg, sub_a,
-                 accA.template get_multi_ptr() +
-                     (sg_startx * TM) * K + k * TK,
-                 K);
-             // Assuming B data is already in VNNI format.
-             joint_matrix_load(
-                 sg, sub_b,
-                 accB.template get_multi_ptr() +
-                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
-                 N * 2);
-             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-           }
-           joint_matrix_store(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-         }); // parallel for
-   }).wait();
-}
-
-template int test() {
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  static constexpr size_t MATRIX_K = TK * 2;
-  bfloat16 A[MATRIX_M][MATRIX_K];
-  bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
-  float C[MATRIX_M][MATRIX_N];
-  float D[MATRIX_M][MATRIX_N];
-
-  matrix_fill(MATRIX_M, MATRIX_K, (bfloat16 *)A,
-              [](int i, int j) { return 1.0f * (i + j); });
-  matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (bfloat16 *)B,
-              [](int i, int j) { return 2.0f * i + 3.0f * j; });
-  matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f);
-  matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f);
-
-  big_matrix MC((float *)&C);
-  big_matrix MD((float *)&D);
-  big_matrix MA((bfloat16 *)&A);
-  big_matrix MB((bfloat16 *)&B);
-  matrix_multiply(MC, MA, MB);
-  matrix_multiply_ref(
-      (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N,
-      MATRIX_K / 2);
-
-  bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D);
-  std::cout << TM << "x" << TN << "x" << TK << " ";
-  std::cout << (res ? "passed" : "failed") << std::endl;
-  return !res;
-}
-
-int main() {
-  queue q;
-  std::vector combinations =
-      q.get_device()
-          .get_info();
-
-  int ret = 0;
-  for (auto &combination : combinations) {
-    if (combination.nsize == 0) { // Intel AMX
-      ret += test<16, 16, 16, class amx16x16x16>();
-      break;
-    }
-
-    if (combination.nsize == 16) { // architecture::intel_gpu_pvc
-      ret += test<16, 16, 16, class pvc16x16x16>();
-      ret += test<32, 64, 16, class pvc32x64x16>();
-      ret += test<1, 64, 16, class pvc1x64x16>();
-      break;
-    }
-  }
-
-  return ret;
-}
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_half_impl.hpp
deleted file mode 100644
index e51e7c30fa810..0000000000000
--- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_half_impl.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-//===---joint_matrix_half_impl.hpp - DPC++ joint_matrix--------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-template class mult;
-
-template
-void matrix_multiply(big_matrix &C, big_matrix &A,
-                     big_matrix &B) {
-  size_t NDRangeM = M / TM;
-  size_t NDRangeN = N / TN;
-  buffer bufA(A.get_data(), range<2>(M, K));
-  buffer bufB(B.get_data(), range<2>(K, N));
-  buffer bufC(C.get_data(), range<2>(M, N));
-
-  queue q;
-  size_t sg_size = get_sg_size>(q);
-  q.submit([&](handler &cgh) {
-     accessor accA{bufA, cgh};
-     accessor accB{bufB, cgh};
-     accessor accC{bufC, cgh};
-
-     cgh.parallel_for>(
-         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}),
-         [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-             [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup
-           // no code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix sub_a;
-           // For B, we assume B has been already VNNIed.
-           joint_matrix
-               sub_b;
-           joint_matrix sub_c;
-
-           joint_matrix_load(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-           for (int k = 0; k < K / TK; k += 1) {
-             joint_matrix_load(
-                 sg, sub_a,
-                 accA.template get_multi_ptr() +
-                     (sg_startx * TM) * K + k * TK,
-                 K);
-             joint_matrix_load(
-                 sg, sub_b,
-                 accB.template get_multi_ptr() +
-                     (k * TK / VNNI) * (N * VNNI) +
-                     sg_starty / sg_size * TN * VNNI,
-                 N * VNNI);
-             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-           }
-           joint_matrix_store(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-         }); // parallel for
-   }).wait();
-}
-
-template
-void test() {
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  static constexpr size_t MATRIX_K = TK * 2;
-  T A[MATRIX_M][MATRIX_K];
-  T B[MATRIX_K / VNNI][MATRIX_N * VNNI];
-  TResult C[MATRIX_M][MATRIX_N];
-  TResult D[MATRIX_M][MATRIX_N];
-
-  matrix_fill(MATRIX_M, MATRIX_K, (T *)A,
-              [](int i, int j) { return i + 2 * j; });
-  matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B,
-              [](int i, int j) { return i + j; });
-  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1));
-  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1));
-
-  big_matrix MC((TResult *)&C);
-  big_matrix MD((TResult *)&D);
-  big_matrix MA((T *)&A);
-  big_matrix MB((T *)&B);
-  matrix_multiply(
-      MC, MA, MB);
-  matrix_multiply_ref((T *)A, (T *)B, (TResult *)D,
-                      MATRIX_M, MATRIX_N, MATRIX_K / VNNI);
-
-  assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D));
-}
-
-int main() {
-  queue q;
-  std::vector combinations =
-      q.get_device()
-          .get_info();
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].atype != matrix_type::fp16)
-      continue;
-
-    if (combinations[i].nsize == 0) { // Intel AMX
-      test();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test();
-      // test();
-
-      // This combination is not currently supported for sub group size = 32 in
-      // IGC
-#if (!defined(SG_SZ) || SG_SZ != 32)
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-#endif
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test();
-      break;
-    }
-  }
-  return 0;
-}
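Note on the test changes below: all of the rewritten mains select tile shapes at run time from the device's reported capabilities (the nsize == 0/8/16 dispatch) instead of hard-coding them. A sketch of the underlying query, with the extension's names written out as I understand the sycl_ext_oneapi_matrix spec — treat exact namespaces and members as assumptions if your toolchain differs:

#include <iostream>
#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;
using syclex::matrix::combination;

int main() {
  sycl::queue q;
  std::vector<combination> combinations =
      q.get_device().get_info<syclex::info::device::matrix_combinations>();
  for (const combination &c : combinations)
    std::cout << "nsize: " << c.nsize << ", max MxNxK: " << c.max_msize
              << " x " << c.max_nsize << " x " << c.max_ksize << "\n";
  return 0;
}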
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
index 5eab1046e6fd8..fb533762d91e4 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
@@ -5,19 +5,58 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+
 // UNSUPPORTED: gpu-intel-dg2
+// UNSUPPORTED-INTENDED: SG size = 32 is not supported for SYCL Joint Matrix on
+// DG2
 
 // REQUIRES: target-spir
-
 // REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
 
 #include "common.hpp"
 
 #define SG_SZ 32
 
-#include "joint_matrix_bfloat16_impl.hpp"
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  queue q;
+  std::vector combinations =
+      q.get_device()
+          .get_info();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test();
+      test();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_accumulator.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_accumulator.cpp
new file mode 100644
index 0000000000000..74502df173cd2
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_accumulator.cpp
@@ -0,0 +1,39 @@
+//==- SG32/joint_matrix_bfloat16_accumulator.cpp - DPC++ joint_matrix -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: gpu-intel-dg2
+// UNSUPPORTED-INTENDED: SG size = 32 is not supported for SYCL Joint Matrix on
+// DG2
+// UNSUPPORTED: cpu
+// UNSUPPORTED-INTENDED: Different C and D types are not supported on AMX
+
+// REQUIRES: target-spir
+// REQUIRES: aspect-ext_intel_matrix
+// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10112, GSD-4181
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
+
+#include "common.hpp"
+
+#define SG_SZ 32
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  std::cout << "B row major:\n";
+  test_all();
+  std::cout << "B packed:\n";
+  test_all();
+}
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
deleted file mode 100644
index c80b477599059..0000000000000
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//==----- joint_matrix_bfloat16_packedB.cpp - DPC++ joint_matrix----------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: target-spir
-
-// REQUIRES: aspect-ext_intel_matrix
-// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
-
-// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
-// UNSUPPORTED: gpu-intel-dg2
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: gpu
-// XFAIL-TRACKER: GSD-4181
-
-#include "common.hpp"
-
-#define SG_SZ 32
-#include "joint_matrix_bfloat16_packedB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
index 8f672fcb82978..df1eecdd66c88 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
@@ -1,12 +1,14 @@
-//==-------- joint_matrix_half.cpp - DPC++ joint_matrix------------ ----==//
+//==------ SG32/joint_matrix_half.cpp - DPC++ joint_matrix--------- ----==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+
 // UNSUPPORTED: gpu-intel-dg2
+// UNSUPPORTED-INTENDED: SG size = 32 is not supported for SYCL Joint Matrix on
+// DG2
 
 // REQUIRES: target-spir
 
@@ -21,4 +23,39 @@
 
 #define SG_SZ 32
 
-#include "joint_matrix_half_impl.hpp"
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  queue q;
+  std::vector combinations =
+      q.get_device()
+          .get_info();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test();
+      test();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_half_accumulator.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_half_accumulator.cpp
new file mode 100644
index 0000000000000..28faa1f9485e3
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_half_accumulator.cpp
@@ -0,0 +1,39 @@
+//==-------SG32/joint_matrix_half_accumulator.cpp - DPC++ joint_matrix ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: gpu-intel-dg2
+// UNSUPPORTED-INTENDED: SG size = 32 is not supported for SYCL Joint Matrix on
+// DG2
+// UNSUPPORTED: cpu
+// UNSUPPORTED-INTENDED: Different C and D types are not supported on AMX
+
+// REQUIRES: target-spir
+// REQUIRES: aspect-ext_intel_matrix
+// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10112, GSD-4181
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
+
+#include "common.hpp"
+
+#define SG_SZ 32
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  std::cout << "B row major:\n";
+  test_all();
+  std::cout << "B packed:\n";
+  test_all();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
index d2acd9c81a715..61afad345c511 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
@@ -11,6 +11,45 @@
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
 
 #include "common.hpp"
-#include "joint_matrix_bfloat16_impl.hpp"
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  queue q;
+  std::vector combinations =
+      q.get_device()
+          .get_info();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test();
+      test();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_accumulator.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_accumulator.cpp
new file mode 100644
index 0000000000000..7c82fd71cb6a1
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_accumulator.cpp
@@ -0,0 +1,35 @@
+//==--- joint_matrix_bfloat16_accumulator.cpp - DPC++ joint_matrix-- ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: cpu
+// UNSUPPORTED-INTENDED: Different C and D types are not supported on AMX
+
+// REQUIRES: target-spir
+
+// REQUIRES: aspect-ext_intel_matrix
+// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10112
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
+
+#include "common.hpp"
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  std::cout << "B row major:\n";
+  test_all();
+  std::cout << "B packed:\n";
+  test_all();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
index 8bcc38d271ec0..ab97297cbda1a 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
@@ -14,4 +14,39 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-#include "joint_matrix_half_impl.hpp"
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  queue q;
+  std::vector combinations =
+      q.get_device()
+          .get_info();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test();
+      test();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/joint_matrix_half_accumulator.cpp
similarity index 58%
rename from sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
rename to sycl/test-e2e/Matrix/joint_matrix_half_accumulator.cpp
index 3c82f2fc78753..ff1bd30375a26 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half_accumulator.cpp
@@ -1,15 +1,23 @@
-//==----- joint_matrix_bfloat16_packedB.cpp - DPC++ joint_matrix----------==//
+//==------ joint_matrix_half_accumulator.cpp - DPC++ joint_matrix----- ----==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: cpu
+// UNSUPPORTED-INTENDED: Different C and D types are not supported on AMX
+
 // REQUIRES: target-spir
+// REQUIRES: aspect-fp16
 // REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27868, win: 101.5181
 
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10112
+
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
@@ -17,4 +25,12 @@
 // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
 
 #include "common.hpp"
-#include "joint_matrix_bfloat16_packedB_impl.hpp"
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  std::cout << "B row major:\n";
+  test_all();
+  std::cout << "B packed:\n";
+  test_all();
+}
diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
index 5f96ad7f8438b..f93e9c2c2f970 100644
--- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
+++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
@@ -54,7 +54,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 232
+// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 229
 //
 // List of improperly UNSUPPORTED tests.
 // Remove the CHECK once the test has been properly UNSUPPORTED.
@@ -188,11 +188,8 @@
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_array.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_down_convert.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_half.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_prefetch.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp