From 9ea6dbac626af1851abe8d35cf6480c7102292bd Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Thu, 21 Nov 2024 13:18:53 -0800 Subject: [PATCH 1/7] [SYCL][Joint Matrix] Update apply to make both matrices read/write --- .../sycl/ext/oneapi/matrix/matrix-unified.hpp | 34 ++--- .../Matrix/SG32/joint_matrix_activation.cpp | 23 ++++ sycl/test-e2e/Matrix/common.hpp | 21 ++- .../Matrix/joint_matrix_activation.cpp | 18 +++ .../Matrix/joint_matrix_activation_impl.hpp | 120 ++++++++++++++++++ .../joint_matrix_apply_two_matrices_impl.hpp | 48 +++++-- 6 files changed, 233 insertions(+), 31 deletions(-) create mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_activation.cpp create mode 100644 sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index 5f898101de031..8e4f5a71c6fe0 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -118,35 +118,39 @@ joint_matrix_apply(Group sg, joint_matrix &jm, return; } -template inline __SYCL_ALWAYS_INLINE void -joint_matrix_apply(Group sg, joint_matrix &jmsrc, - joint_matrix &jmdest, +joint_matrix_apply(Group sg, joint_matrix &jm0, + joint_matrix &jm1, F &&lambda) { #if defined(__SYCL_DEVICE_ONLY__) #if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__) std::ignore = sg; for (int i = 0; i < jmsrc.matrix_impl.wi_marray.size(); i++) { - lambda(jmsrc.matrix_impl.wi_marray[i], jmdest.matrix_impl.wi_marray[i]); + lambda(jm0.matrix_impl.wi_marray[i], jm1.matrix_impl.wi_marray[i]); } #else // NVPTX - using storage_element_type = + using storage_element_type0 = typename oneapi::detail::jm_type_interpretation_helper_trait< - T>::storage_element_type; - auto wi_data_c = sycl::ext::oneapi::detail::get_wi_data(sg, jmsrc); - auto wi_data_d = sycl::ext::oneapi::detail::get_wi_data(sg, jmdest); - for (int i = 0; i < wi_data_c.length(); i++) { - storage_element_type elementsrc = wi_data_c[i]; - storage_element_type elementdest = wi_data_d[i]; - lambda(elementsrc, elementdest); - wi_data_d[i] = elementdest; + T0>::storage_element_type; + using storage_element_type1 = + typename oneapi::detail::jm_type_interpretation_helper_trait< + T1>::storage_element_type; + auto wi_data_0 = sycl::ext::oneapi::detail::get_wi_data(sg, jm0); + auto wi_data_1 = sycl::ext::oneapi::detail::get_wi_data(sg, jm1); + for (int i = 0; i < wi_data_0.length(); i++) { + storage_element_type0 element0 = wi_data_0[i]; + storage_element_type1 element1 = wi_data_1[i]; + lambda(element0, element1); + wi_data_0[i] = element0; + wi_data_1[i] = element1; } #endif #else std::ignore = sg; - std::ignore = jmsrc; - std::ignore = jmdest; + std::ignore = jm0; + std::ignore = jm1; std::ignore = lambda; throw exception(make_error_code(errc::runtime), "joint matrix is not supported on host."); diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp new file mode 100644 index 0000000000000..67565e0202372 --- /dev/null +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp @@ -0,0 +1,23 @@ +//==---------- joint_matrix_activation.cpp - DPC++ joint_matrix-----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} %fp-model-precise -o %t.out +// RUN: %{run} %t.out + +// Currently, the outlining into an apply function triggers a bug in IGC +// XFAIL: gpu + +#include "../common.hpp" + +#define SG_SZ 32 + +#include "../joint_matrix_activation_impl.hpp" diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp index 90f5508d97cf8..9ec12996b394f 100644 --- a/sycl/test-e2e/Matrix/common.hpp +++ b/sycl/test-e2e/Matrix/common.hpp @@ -36,6 +36,12 @@ template struct big_matrix { big_matrix(T *data) : mat(data) {} }; +enum class Activation { + ReLU, + Sigmoid, + None, +}; + float make_fp32(bfloat16 x) { unsigned int y = *((int *)&x); y = y << 16; @@ -156,7 +162,8 @@ void matrix_copy(unsigned int rows, unsigned int cols, T *src, T *dst) { } } -template +template bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { @@ -164,7 +171,17 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { std::is_same_v || (std::is_same_v && std::is_same_v))) { - float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]); + float diff = 0; + if constexpr (act == Activation::None) + diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]); + else if constexpr (act == Activation::ReLU) + diff = + std::fabs(src[i * cols + j] - + (T1)(sycl::max(static_cast(0), ref[i * cols + j]))); + else if constexpr (act == Activation::Sigmoid) + diff = std::fabs(src[i * cols + j] - + (T1)(1.0f / (1.0f + sycl::exp(-ref[i * cols + j])))); + if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) { std::cout << "Incorrect result in matrix. " << "i: " << i << ", j: " << j diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/joint_matrix_activation.cpp new file mode 100644 index 0000000000000..42180043cd561 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_activation.cpp @@ -0,0 +1,18 @@ +//==---------- joint_matrix_activation.cpp - DPC++ joint_matrix------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix +// UNSUPPORTED: gpu-intel-dg2 + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Currently, the outlining into an apply function triggers a bug in IGC +// XFAIL: gpu + +#include "common.hpp" +#include "joint_matrix_activation_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp new file mode 100644 index 0000000000000..2ad712a244e52 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp @@ -0,0 +1,120 @@ +//==-------- joint_matrix_down_convert_impl.hpp - DPC++ joint_matrix-------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +constexpr size_t TM = 8; +// TN and TK must be the same for this test. +constexpr size_t TN = 16; +constexpr size_t TK = 16; + +template +void applyActivation( + Group &sg, joint_matrix &sub_c, + joint_matrix &sub_a) { + if constexpr (act == Activation::None) { + joint_matrix_copy(sg, sub_c, sub_a); + } else if constexpr (act == Activation::ReLU) { + + joint_matrix_apply( + sg, sub_c, [=](float &x) { x = sycl::max(static_cast(0), x); }); + joint_matrix_copy(sg, sub_c, sub_a); + + } else if constexpr (act == Activation::Sigmoid) { + joint_matrix_apply(sg, sub_c, + [=](float &x) { x = 1.0f / (1.0f + sycl::exp(-x)); }); + joint_matrix_copy(sg, sub_c, sub_a); + } + return; +} + +template class copy; + +template +void matrix_activation_copy(big_matrix &C, big_matrix &A) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufC((float *)C.get_data(), range<2>(M, N)); + + queue q; + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[sycl::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix sub_c; + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + applyActivation(sg, sub_c, sub_a); + + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N); + }); // parallel for + }).wait(); +} + +int main() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + static constexpr size_t MATRIX_K = TK * 2; + bfloat16 A[MATRIX_M][MATRIX_K]; + float C[MATRIX_M][MATRIX_N]; + + matrix_rand(MATRIX_M, MATRIX_N, *C, (float)5); + + big_matrix MC((float *)&C); + big_matrix MA((bfloat16 *)&A); + + matrix_activation_copy(MC, MA); + bool res0 = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); + bool res = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, + (float *)C); + std::cout << (res ? "Copy passed" : "failed") << std::endl; + + matrix_activation_copy(MC, MA); + res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, + (float *)C); + std::cout << (res ? "ReLU passed" : "failed") << std::endl; + + matrix_activation_copy(MC, MA); + res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, + (float *)C); + std::cout << (res ? "Sigmoid passed" : "failed") << std::endl; + + return !res; +} diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp index a88b0ca55416e..a1cd72f19af49 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp @@ -8,19 +8,24 @@ #include template -bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar) { +bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref) { for (size_t i = 0; i < M; i++) for (size_t j = 0; j < N; j++) { - Tc diffc = D[i * N + j] - C[i * N + j] * 2; - Ta diffa = Ar[i * N + j] - (A[i * N + j] + 42); + Tc diffd = D[i * N + j] - Cref[i * N + j] * 2; + Tc diffc = C[i * N + j] - sycl::max(static_cast(0), Cref[i * N + j]); + Ta diffar = Ar[i * N + j] - (Aref[i * N + j] + 42); + Ta diffa = A[i * N + j] - (Aref[i * N + j] + 5); if constexpr (std::is_same_v) { - if (std::fabs(diffc) > FLOAT_EPSILON || + if (std::fabs(diffd) > FLOAT_EPSILON || + std::fabs(diffc) > FLOAT_EPSILON || + std::fabs(diffar) > FLOAT_EPSILON || std::fabs(diffa) > FLOAT_EPSILON || std::isnan(C[i * N + j]) || std::isnan(A[i * N + j])) { return false; } } else { - if (std::abs(diffc) > 0 || std::abs(diffa) > 0) { + if (std::abs(diffd) > 0 || std::abs(diffc) > 0 || + std::abs(diffar) > 0 || std::abs(diffa) > 0) { return false; } } @@ -29,7 +34,8 @@ bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar) { } template -bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, queue q) { +bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref, + queue q) { size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; @@ -70,22 +76,32 @@ bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, queue q) { joint_matrix_load( sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); - joint_matrix_apply(sg, sub_c, sub_d, - [](const Tc &x, Tc &y) { y = x * 2; }); + joint_matrix_apply(sg, sub_c, sub_d, [](Tc &x, Tc &y) { + y = x * 2; + x = sycl::max(static_cast(0), x); + }); joint_matrix_store( sg, sub_d, pD + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); + joint_matrix_store( + sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); joint_matrix_load( sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK, K); - joint_matrix_apply(sg, sub_a, sub_ar, - [](const Ta &x, Ta &y) { y = x + 42; }); + joint_matrix_apply(sg, sub_a, sub_ar, [](Ta &x, Ta &y) { + y = x + 42; + x += 5; + }); ext::intel::experimental::matrix::joint_matrix_store( sg, sub_ar, pAr + (sg_startx * TM) * K + sg_starty / sg_size * TK, K); + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK, + K); }); // parallel for }).wait(); - return apply_verify(C, D, A, Ar); + return apply_verify(C, D, A, Ar, Cref, Aref); } template (M * N, q); + Ta *Aref = malloc_shared(M * K, q); Tc *C = malloc_shared(M * N, q); Tc *D = malloc_shared(M * N, q); Ta *A = malloc_shared(M * K, q); Ta *Ar = malloc_shared(M * K, q); - matrix_rand(M, N, (Tc *)C, (Tc)100); - matrix_rand(M, K, (Ta *)A, (Ta)100); + matrix_rand(M, N, (Tc *)Cref, (Tc)100); + matrix_rand(M, K, (Ta *)Aref, (Ta)100); + matrix_copy(M, N, Cref, C); + matrix_copy(M, K, Aref, A); bool res = apply_two_matrices( - C, D, A, Ar, q); + C, D, A, Ar, Cref, Aref, q); if constexpr (std::is_same_v) std::cout << "bfloat16 " << TM << "x" << TN << "x" << TK << ": " From 68ea5851a1800bcbbe395cf4e630da46c3eede2d Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Thu, 21 Nov 2024 13:46:00 -0800 Subject: [PATCH 2/7] Add xfail trackers --- sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp | 1 + sycl/test-e2e/Matrix/joint_matrix_activation.cpp | 1 + sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp | 6 +++--- .../Matrix/joint_matrix_apply_two_matrices_impl.hpp | 4 +++- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index 8e4f5a71c6fe0..cab0b2f599575 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -127,7 +127,7 @@ joint_matrix_apply(Group sg, joint_matrix &jm0, #if defined(__SYCL_DEVICE_ONLY__) #if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__) std::ignore = sg; - for (int i = 0; i < jmsrc.matrix_impl.wi_marray.size(); i++) { + for (int i = 0; i < jm0.matrix_impl.wi_marray.size(); i++) { lambda(jm0.matrix_impl.wi_marray[i], jm1.matrix_impl.wi_marray[i]); } #else // NVPTX diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp index 67565e0202372..c08f6b3dc0b1d 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp @@ -15,6 +15,7 @@ // Currently, the outlining into an apply function triggers a bug in IGC // XFAIL: gpu +// XFAIL-TRACKER: GSD-10373 #include "../common.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/joint_matrix_activation.cpp index 42180043cd561..cd66132688bea 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_activation.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_activation.cpp @@ -13,6 +13,7 @@ // Currently, the outlining into an apply function triggers a bug in IGC // XFAIL: gpu +// XFAIL-TRACKER: GSD-10373 #include "common.hpp" #include "joint_matrix_activation_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp index 2ad712a244e52..7844bc9d35c95 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp @@ -104,17 +104,17 @@ int main() { bool res0 = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); bool res = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); - std::cout << (res ? "Copy passed" : "failed") << std::endl; + std::cout << (res ? "Copy passed" : "Copy failed") << std::endl; matrix_activation_copy(MC, MA); res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); - std::cout << (res ? "ReLU passed" : "failed") << std::endl; + std::cout << (res ? "ReLU passed" : "ReLU failed") << std::endl; matrix_activation_copy(MC, MA); res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); - std::cout << (res ? "Sigmoid passed" : "failed") << std::endl; + std::cout << (res ? "Sigmoid passed" : "Sigmoid failed") << std::endl; return !res; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp index a1cd72f19af49..eb38efbf752fe 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp @@ -137,7 +137,9 @@ bool test() { free(D, q); free(A, q); free(Ar, q); - + free(Cref, q); + free(Aref, q); + return res; } From 929ca93db1a4da96c2ccbdfbf0292408f258bac4 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Thu, 21 Nov 2024 13:52:41 -0800 Subject: [PATCH 3/7] Format --- sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp index eb38efbf752fe..58ff0672918e7 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp @@ -139,7 +139,7 @@ bool test() { free(Ar, q); free(Cref, q); free(Aref, q); - + return res; } From 14d4ef6e4d22db13b4131963bfc09cbc6b2a866c Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Tue, 26 Nov 2024 08:29:42 -0800 Subject: [PATCH 4/7] Address Yury's comments --- .../Matrix/SG32/joint_matrix_activation.cpp | 3 +- sycl/test-e2e/Matrix/common.hpp | 22 ++++---- .../Matrix/joint_matrix_activation.cpp | 1 - .../Matrix/joint_matrix_activation_impl.hpp | 21 +++++--- .../joint_matrix_apply_two_matrices_impl.hpp | 50 ++++++++----------- 5 files changed, 46 insertions(+), 51 deletions(-) diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp index c08f6b3dc0b1d..03e891cca28d0 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp @@ -7,8 +7,9 @@ //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix -// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 // UNSUPPORTED: gpu-intel-dg2 +// UNSUPPORTED-INTENDED: SG size = 32 is not currently supported for SYCL Joint +// Matrix by IGC on DG2 // RUN: %{build} %fp-model-precise -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp index 9ec12996b394f..4e98b6a9335a6 100644 --- a/sycl/test-e2e/Matrix/common.hpp +++ b/sycl/test-e2e/Matrix/common.hpp @@ -162,6 +162,13 @@ void matrix_copy(unsigned int rows, unsigned int cols, T *src, T *dst) { } } +template +void matrix_apply(unsigned int rows, unsigned int cols, T *mat, F op) { + for (unsigned int i = 0; i < rows; i++) + for (unsigned int j = 0; j < cols; j++) + mat[i * cols + j] = op(mat[i * cols + j]); +} + template bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { @@ -171,17 +178,7 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { std::is_same_v || (std::is_same_v && std::is_same_v))) { - float diff = 0; - if constexpr (act == Activation::None) - diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]); - else if constexpr (act == Activation::ReLU) - diff = - std::fabs(src[i * cols + j] - - (T1)(sycl::max(static_cast(0), ref[i * cols + j]))); - else if constexpr (act == Activation::Sigmoid) - diff = std::fabs(src[i * cols + j] - - (T1)(1.0f / (1.0f + sycl::exp(-ref[i * cols + j])))); - + float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]); if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) { std::cout << "Incorrect result in matrix. " << "i: " << i << ", j: " << j @@ -190,7 +187,8 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { << ", Epsilon: " << FLOAT_EPSILON << "\n"; return false; } - } else if constexpr (exact || std::is_same_v) { + } else if constexpr (exact || std::is_same_v || + std::is_same_v) { if (src[i * cols + j] != ref[i * cols + j]) { std::cout << "Incorrect result in matrix." << "i: " << i << ", j: " << j diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/joint_matrix_activation.cpp index cd66132688bea..2a795ecedfbe4 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_activation.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_activation.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix -// UNSUPPORTED: gpu-intel-dg2 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp index 7844bc9d35c95..8779663d74733 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp @@ -15,6 +15,12 @@ constexpr size_t TM = 8; constexpr size_t TN = 16; constexpr size_t TK = 16; +template T ReLU(T x) { return sycl::max(static_cast(0), x); } + +template T Sigmoid(T x) { + return x = 1.0f / (1.0f + sycl::exp(-x)); +} + template @@ -25,13 +31,11 @@ void applyActivation( joint_matrix_copy(sg, sub_c, sub_a); } else if constexpr (act == Activation::ReLU) { - joint_matrix_apply( - sg, sub_c, [=](float &x) { x = sycl::max(static_cast(0), x); }); + joint_matrix_apply(sg, sub_c, [=](float &x) { x = ReLU(x); }); joint_matrix_copy(sg, sub_c, sub_a); } else if constexpr (act == Activation::Sigmoid) { - joint_matrix_apply(sg, sub_c, - [=](float &x) { x = 1.0f / (1.0f + sycl::exp(-x)); }); + joint_matrix_apply(sg, sub_c, [=](float &x) { x = Sigmoid(x); }); joint_matrix_copy(sg, sub_c, sub_a); } return; @@ -107,13 +111,14 @@ int main() { std::cout << (res ? "Copy passed" : "Copy failed") << std::endl; matrix_activation_copy(MC, MA); - res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, - (float *)C); + matrix_apply(MATRIX_M, MATRIX_N, (float *)C, [](float x) { return ReLU(x); }); + res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); std::cout << (res ? "ReLU passed" : "ReLU failed") << std::endl; matrix_activation_copy(MC, MA); - res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, - (float *)C); + matrix_apply(MATRIX_M, MATRIX_N, (float *)C, + [](float x) { return Sigmoid(x); }); + res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); std::cout << (res ? "Sigmoid passed" : "Sigmoid failed") << std::endl; return !res; diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp index 58ff0672918e7..11f0258d1d6b8 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp @@ -7,31 +7,22 @@ //===----------------------------------------------------------------------===// #include -template -bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref) { - for (size_t i = 0; i < M; i++) - for (size_t j = 0; j < N; j++) { - Tc diffd = D[i * N + j] - Cref[i * N + j] * 2; - Tc diffc = C[i * N + j] - sycl::max(static_cast(0), Cref[i * N + j]); - Ta diffar = Ar[i * N + j] - (Aref[i * N + j] + 42); - Ta diffa = A[i * N + j] - (Aref[i * N + j] + 5); - if constexpr (std::is_same_v) { - if (std::fabs(diffd) > FLOAT_EPSILON || - std::fabs(diffc) > FLOAT_EPSILON || - std::fabs(diffar) > FLOAT_EPSILON || - std::fabs(diffa) > FLOAT_EPSILON || std::isnan(C[i * N + j]) || - std::isnan(A[i * N + j])) { - return false; - } - } else { - if (std::abs(diffd) > 0 || std::abs(diffc) > 0 || - std::abs(diffar) > 0 || std::abs(diffa) > 0) { - return false; - } - } - } - return true; +template T mul2(T x) { return x * 2; } + +template T add5(T x) { return x + 5; } + +template +bool apply_verify(Tc *C, Tc *D, Tc *ref) { + Tc *refcopy = (Tc *)std::malloc(M * N * sizeof(Tc)); + memcpy(refcopy, ref, M * N * sizeof(Tc)); + matrix_apply(M, N, ref, [](Tc x) { return mul2(x); }); + bool res = matrix_compare(M, N, D, ref); + + matrix_apply(M, N, refcopy, [](Tc x) { return add5(x); }); + res &= matrix_compare(M, N, C, refcopy); + return res; } + template bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref, @@ -77,8 +68,8 @@ bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref, sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); joint_matrix_apply(sg, sub_c, sub_d, [](Tc &x, Tc &y) { - y = x * 2; - x = sycl::max(static_cast(0), x); + y = mul2(x); + x = add5(x); }); joint_matrix_store( sg, sub_d, pD + (sg_startx * TM) * N + sg_starty / sg_size * TN, @@ -90,8 +81,8 @@ bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref, sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK, K); joint_matrix_apply(sg, sub_a, sub_ar, [](Ta &x, Ta &y) { - y = x + 42; - x += 5; + y = mul2(x); + x = add5(x); }); ext::intel::experimental::matrix::joint_matrix_store( sg, sub_ar, @@ -101,7 +92,8 @@ bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref, K); }); // parallel for }).wait(); - return apply_verify(C, D, A, Ar, Cref, Aref); + return apply_verify(C, D, Cref) && + apply_verify(A, Ar, Aref); } template Date: Tue, 26 Nov 2024 08:39:40 -0800 Subject: [PATCH 5/7] move activations out of common.hpp --- sycl/test-e2e/Matrix/common.hpp | 9 +-------- sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp | 10 +++++++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp index 4e98b6a9335a6..1b133de98d580 100644 --- a/sycl/test-e2e/Matrix/common.hpp +++ b/sycl/test-e2e/Matrix/common.hpp @@ -36,12 +36,6 @@ template struct big_matrix { big_matrix(T *data) : mat(data) {} }; -enum class Activation { - ReLU, - Sigmoid, - None, -}; - float make_fp32(bfloat16 x) { unsigned int y = *((int *)&x); y = y << 16; @@ -169,8 +163,7 @@ void matrix_apply(unsigned int rows, unsigned int cols, T *mat, F op) { mat[i * cols + j] = op(mat[i * cols + j]); } -template +template bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp index 8779663d74733..a1a0cfd003f31 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp @@ -15,6 +15,12 @@ constexpr size_t TM = 8; constexpr size_t TN = 16; constexpr size_t TK = 16; +enum class Activation { + ReLU, + Sigmoid, + None, +}; + template T ReLU(T x) { return sycl::max(static_cast(0), x); } template T Sigmoid(T x) { @@ -105,9 +111,7 @@ int main() { big_matrix MA((bfloat16 *)&A); matrix_activation_copy(MC, MA); - bool res0 = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); - bool res = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, - (float *)C); + bool res = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); std::cout << (res ? "Copy passed" : "Copy failed") << std::endl; matrix_activation_copy(MC, MA); From c7e771210769801df3287aa79271ee1b7fe7a290 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Tue, 3 Dec 2024 14:00:31 -0800 Subject: [PATCH 6/7] further minor changes --- sycl/test-e2e/Matrix/common.hpp | 3 +-- sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/test-e2e/Matrix/common.hpp b/sycl/test-e2e/Matrix/common.hpp index 1b133de98d580..db184466649d5 100644 --- a/sycl/test-e2e/Matrix/common.hpp +++ b/sycl/test-e2e/Matrix/common.hpp @@ -180,8 +180,7 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { << ", Epsilon: " << FLOAT_EPSILON << "\n"; return false; } - } else if constexpr (exact || std::is_same_v || - std::is_same_v) { + } else if constexpr (exact || std::is_integral_v) { if (src[i * cols + j] != ref[i * cols + j]) { std::cout << "Incorrect result in matrix." << "i: " << i << ", j: " << j diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp index 11f0258d1d6b8..e8fdf866e641a 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp @@ -15,10 +15,10 @@ template bool apply_verify(Tc *C, Tc *D, Tc *ref) { Tc *refcopy = (Tc *)std::malloc(M * N * sizeof(Tc)); memcpy(refcopy, ref, M * N * sizeof(Tc)); - matrix_apply(M, N, ref, [](Tc x) { return mul2(x); }); + matrix_apply(M, N, ref, mul2); bool res = matrix_compare(M, N, D, ref); - matrix_apply(M, N, refcopy, [](Tc x) { return add5(x); }); + matrix_apply(M, N, refcopy, add5); res &= matrix_compare(M, N, C, refcopy); return res; } From 58d5fa23b0ad20946489eaefd349afd2bfee5a36 Mon Sep 17 00:00:00 2001 From: Dounia Khaldi Date: Mon, 9 Dec 2024 06:21:23 -0800 Subject: [PATCH 7/7] Remove the test to move forward with this PR --- .../Matrix/SG32/joint_matrix_activation.cpp | 25 ---- .../Matrix/joint_matrix_activation.cpp | 18 --- .../Matrix/joint_matrix_activation_impl.hpp | 129 ------------------ 3 files changed, 172 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_activation.cpp delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp deleted file mode 100644 index 03e891cca28d0..0000000000000 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//==---------- joint_matrix_activation.cpp - DPC++ joint_matrix-----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix - -// UNSUPPORTED: gpu-intel-dg2 -// UNSUPPORTED-INTENDED: SG size = 32 is not currently supported for SYCL Joint -// Matrix by IGC on DG2 - -// RUN: %{build} %fp-model-precise -o %t.out -// RUN: %{run} %t.out - -// Currently, the outlining into an apply function triggers a bug in IGC -// XFAIL: gpu -// XFAIL-TRACKER: GSD-10373 - -#include "../common.hpp" - -#define SG_SZ 32 - -#include "../joint_matrix_activation_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/joint_matrix_activation.cpp deleted file mode 100644 index 2a795ecedfbe4..0000000000000 --- a/sycl/test-e2e/Matrix/joint_matrix_activation.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//==---------- joint_matrix_activation.cpp - DPC++ joint_matrix------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// Currently, the outlining into an apply function triggers a bug in IGC -// XFAIL: gpu -// XFAIL-TRACKER: GSD-10373 - -#include "common.hpp" -#include "joint_matrix_activation_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp deleted file mode 100644 index a1a0cfd003f31..0000000000000 --- a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp +++ /dev/null @@ -1,129 +0,0 @@ -//==-------- joint_matrix_down_convert_impl.hpp - DPC++ joint_matrix-------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -constexpr size_t TM = 8; -// TN and TK must be the same for this test. -constexpr size_t TN = 16; -constexpr size_t TK = 16; - -enum class Activation { - ReLU, - Sigmoid, - None, -}; - -template T ReLU(T x) { return sycl::max(static_cast(0), x); } - -template T Sigmoid(T x) { - return x = 1.0f / (1.0f + sycl::exp(-x)); -} - -template -void applyActivation( - Group &sg, joint_matrix &sub_c, - joint_matrix &sub_a) { - if constexpr (act == Activation::None) { - joint_matrix_copy(sg, sub_c, sub_a); - } else if constexpr (act == Activation::ReLU) { - - joint_matrix_apply(sg, sub_c, [=](float &x) { x = ReLU(x); }); - joint_matrix_copy(sg, sub_c, sub_a); - - } else if constexpr (act == Activation::Sigmoid) { - joint_matrix_apply(sg, sub_c, [=](float &x) { x = Sigmoid(x); }); - joint_matrix_copy(sg, sub_c, sub_a); - } - return; -} - -template class copy; - -template -void matrix_activation_copy(big_matrix &C, big_matrix &A) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufC((float *)C.get_data(), range<2>(M, N)); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[sycl::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix sub_c; - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); - applyActivation(sg, sub_c, sub_a); - - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, - N); - }); // parallel for - }).wait(); -} - -int main() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - static constexpr size_t MATRIX_K = TK * 2; - bfloat16 A[MATRIX_M][MATRIX_K]; - float C[MATRIX_M][MATRIX_N]; - - matrix_rand(MATRIX_M, MATRIX_N, *C, (float)5); - - big_matrix MC((float *)&C); - big_matrix MA((bfloat16 *)&A); - - matrix_activation_copy(MC, MA); - bool res = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); - std::cout << (res ? "Copy passed" : "Copy failed") << std::endl; - - matrix_activation_copy(MC, MA); - matrix_apply(MATRIX_M, MATRIX_N, (float *)C, [](float x) { return ReLU(x); }); - res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); - std::cout << (res ? "ReLU passed" : "ReLU failed") << std::endl; - - matrix_activation_copy(MC, MA); - matrix_apply(MATRIX_M, MATRIX_N, (float *)C, - [](float x) { return Sigmoid(x); }); - res &= matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C); - std::cout << (res ? "Sigmoid passed" : "Sigmoid failed") << std::endl; - - return !res; -}