Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 20 additions & 16 deletions sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,35 +118,39 @@ joint_matrix_apply(Group sg, joint_matrix<Group, T, Use, M, N, Layout> &jm,
return;
}

template <typename Group, typename T, use Use, size_t M, size_t N,
template <typename Group, typename T0, typename T1, use Use, size_t M, size_t N,
layout Layout, typename F>
inline __SYCL_ALWAYS_INLINE void
joint_matrix_apply(Group sg, joint_matrix<Group, T, Use, M, N, Layout> &jmsrc,
joint_matrix<Group, T, Use, M, N, Layout> &jmdest,
joint_matrix_apply(Group sg, joint_matrix<Group, T0, Use, M, N, Layout> &jm0,
joint_matrix<Group, T1, Use, M, N, Layout> &jm1,
F &&lambda) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__)
std::ignore = sg;
for (int i = 0; i < jmsrc.matrix_impl.wi_marray.size(); i++) {
lambda(jmsrc.matrix_impl.wi_marray[i], jmdest.matrix_impl.wi_marray[i]);
for (int i = 0; i < jm0.matrix_impl.wi_marray.size(); i++) {
lambda(jm0.matrix_impl.wi_marray[i], jm1.matrix_impl.wi_marray[i]);
}
#else // NVPTX
using storage_element_type =
using storage_element_type0 =
typename oneapi::detail::jm_type_interpretation_helper_trait<
T>::storage_element_type;
auto wi_data_c = sycl::ext::oneapi::detail::get_wi_data(sg, jmsrc);
auto wi_data_d = sycl::ext::oneapi::detail::get_wi_data(sg, jmdest);
for (int i = 0; i < wi_data_c.length(); i++) {
storage_element_type elementsrc = wi_data_c[i];
storage_element_type elementdest = wi_data_d[i];
lambda(elementsrc, elementdest);
wi_data_d[i] = elementdest;
T0>::storage_element_type;
using storage_element_type1 =
typename oneapi::detail::jm_type_interpretation_helper_trait<
T1>::storage_element_type;
auto wi_data_0 = sycl::ext::oneapi::detail::get_wi_data(sg, jm0);
auto wi_data_1 = sycl::ext::oneapi::detail::get_wi_data(sg, jm1);
for (int i = 0; i < wi_data_0.length(); i++) {
storage_element_type0 element0 = wi_data_0[i];
storage_element_type1 element1 = wi_data_1[i];
lambda(element0, element1);
wi_data_0[i] = element0;
wi_data_1[i] = element1;
}
#endif
#else
std::ignore = sg;
std::ignore = jmsrc;
std::ignore = jmdest;
std::ignore = jm0;
std::ignore = jm1;
std::ignore = lambda;
throw exception(make_error_code(errc::runtime),
"joint matrix is not supported on host.");
Expand Down
24 changes: 24 additions & 0 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//==---------- joint_matrix_activation.cpp - DPC++ joint_matrix-----------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix

// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2

// RUN: %{build} %fp-model-precise -o %t.out
// RUN: %{run} %t.out

// Currently, the outlining into an apply function triggers a bug in IGC
// XFAIL: gpu
// XFAIL-TRACKER: GSD-10373

#include "../common.hpp"

#define SG_SZ 32

#include "../joint_matrix_activation_impl.hpp"
21 changes: 19 additions & 2 deletions sycl/test-e2e/Matrix/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
big_matrix(T *data) : mat(data) {}
};

enum class Activation {
ReLU,
Sigmoid,
None,
};

float make_fp32(bfloat16 x) {
unsigned int y = *((int *)&x);
y = y << 16;
Expand Down Expand Up @@ -156,15 +162,26 @@ void matrix_copy(unsigned int rows, unsigned int cols, T *src, T *dst) {
}
}

template <typename T1, typename T2, bool exact = false>
template <Activation act = Activation::None, typename T1, typename T2,
bool exact = false>
bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
if constexpr (!exact && (std::is_same_v<T1, float> ||
std::is_same_v<T1, bfloat16> ||
(std::is_same_v<T1, double> &&
std::is_same_v<T2, double>))) {
float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]);
float diff = 0;
if constexpr (act == Activation::None)
diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]);
else if constexpr (act == Activation::ReLU)
diff =
std::fabs(src[i * cols + j] -
(T1)(sycl::max(static_cast<T2>(0), ref[i * cols + j])));
else if constexpr (act == Activation::Sigmoid)
diff = std::fabs(src[i * cols + j] -
(T1)(1.0f / (1.0f + sycl::exp(-ref[i * cols + j]))));

if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) {
std::cout << "Incorrect result in matrix. "
<< "i: " << i << ", j: " << j
Expand Down
19 changes: 19 additions & 0 deletions sycl/test-e2e/Matrix/joint_matrix_activation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
//==---------- joint_matrix_activation.cpp - DPC++ joint_matrix------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-ext_intel_matrix
// UNSUPPORTED: gpu-intel-dg2

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Currently, the outlining into an apply function triggers a bug in IGC
// XFAIL: gpu
// XFAIL-TRACKER: GSD-10373

#include "common.hpp"
#include "joint_matrix_activation_impl.hpp"
120 changes: 120 additions & 0 deletions sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
//==-------- joint_matrix_down_convert_impl.hpp - DPC++ joint_matrix-------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <iostream>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

constexpr size_t TM = 8;
// TN and TK must be the same for this test.
constexpr size_t TN = 16;
constexpr size_t TK = 16;

template <Activation act, size_t TM, size_t TK, typename Group, typename Tsrc,
typename Tdest, use UseSrc, use UseDest, layout LayoutSrc,
layout LayoutDest>
void applyActivation(
Group &sg, joint_matrix<Group, Tsrc, UseSrc, TM, TK, LayoutSrc> &sub_c,
joint_matrix<Group, Tdest, UseDest, TM, TN, LayoutDest> &sub_a) {
if constexpr (act == Activation::None) {
joint_matrix_copy(sg, sub_c, sub_a);
} else if constexpr (act == Activation::ReLU) {

joint_matrix_apply(
sg, sub_c, [=](float &x) { x = sycl::max(static_cast<float>(0), x); });
joint_matrix_copy(sg, sub_c, sub_a);

} else if constexpr (act == Activation::Sigmoid) {
joint_matrix_apply(sg, sub_c,
[=](float &x) { x = 1.0f / (1.0f + sycl::exp(-x)); });
joint_matrix_copy(sg, sub_c, sub_a);
}
return;
}

template <Activation act> class copy;

template <Activation act, typename T1, typename T2, size_t M, size_t N,
size_t K>
void matrix_activation_copy(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A) {
size_t NDRangeM = M / TM;
size_t NDRangeN = N / TN;
buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));

queue q;
size_t sg_size = get_sg_size<copy<act>>(q);
q.submit([&](handler &cgh) {
auto accC = bufC.get_access<access::mode::read_write>(cgh);
auto accA = bufA.get_access<access::mode::write>(cgh);

cgh.parallel_for<copy<act>>(
nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
[=](nd_item<2> spmd_item)
#ifdef SG_SZ
[[sycl::reqd_sub_group_size(SG_SZ)]]
#endif
{
// The submatrix API has to be accessed by all the workitems in a
// subgroup these functions will be called once by the subgroup no
// code divergence between the workitems
const auto global_idx = spmd_item.get_global_id(0);
const auto global_idy = spmd_item.get_global_id(1);
const auto sg_startx = global_idx - spmd_item.get_local_id(0);
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major>
sub_a;
joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
joint_matrix_load(
sg, sub_c,
accC.template get_multi_ptr<access::decorated::no>() +
(sg_startx * TM) * N + sg_starty / sg_size * TN,
N, layout::row_major);
applyActivation<act>(sg, sub_c, sub_a);

ext::intel::experimental::matrix::joint_matrix_store(
sg, sub_a,
accA.template get_multi_ptr<access::decorated::no>() +
(sg_startx * TM) * N + sg_starty / sg_size * TN,
N);
}); // parallel for
}).wait();
}

int main() {
static constexpr size_t MATRIX_M = TM * 2;
static constexpr size_t MATRIX_N = TN * 2;
static constexpr size_t MATRIX_K = TK * 2;
bfloat16 A[MATRIX_M][MATRIX_K];
float C[MATRIX_M][MATRIX_N];

matrix_rand(MATRIX_M, MATRIX_N, *C, (float)5);

big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);

matrix_activation_copy<Activation::None>(MC, MA);
bool res0 = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C);
bool res = matrix_compare<Activation::None>(MATRIX_M, MATRIX_N, (bfloat16 *)A,
(float *)C);
std::cout << (res ? "Copy passed" : "Copy failed") << std::endl;

matrix_activation_copy<Activation::ReLU>(MC, MA);
res &= matrix_compare<Activation::ReLU>(MATRIX_M, MATRIX_N, (bfloat16 *)A,
(float *)C);
std::cout << (res ? "ReLU passed" : "ReLU failed") << std::endl;

matrix_activation_copy<Activation::Sigmoid>(MC, MA);
res &= matrix_compare<Activation::Sigmoid>(MATRIX_M, MATRIX_N, (bfloat16 *)A,
(float *)C);
std::cout << (res ? "Sigmoid passed" : "Sigmoid failed") << std::endl;

return !res;
}
50 changes: 36 additions & 14 deletions sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,24 @@
#include <sycl/usm.hpp>

template <typename Tc, typename Ta, size_t M, size_t N>
bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar) {
bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref) {
for (size_t i = 0; i < M; i++)
for (size_t j = 0; j < N; j++) {
Tc diffc = D[i * N + j] - C[i * N + j] * 2;
Ta diffa = Ar[i * N + j] - (A[i * N + j] + 42);
Tc diffd = D[i * N + j] - Cref[i * N + j] * 2;
Tc diffc = C[i * N + j] - sycl::max(static_cast<Tc>(0), Cref[i * N + j]);
Ta diffar = Ar[i * N + j] - (Aref[i * N + j] + 42);
Ta diffa = A[i * N + j] - (Aref[i * N + j] + 5);
if constexpr (std::is_same_v<Ta, bfloat16>) {
if (std::fabs(diffc) > FLOAT_EPSILON ||
if (std::fabs(diffd) > FLOAT_EPSILON ||
std::fabs(diffc) > FLOAT_EPSILON ||
std::fabs(diffar) > FLOAT_EPSILON ||
std::fabs(diffa) > FLOAT_EPSILON || std::isnan(C[i * N + j]) ||
std::isnan(A[i * N + j])) {
return false;
}
} else {
if (std::abs(diffc) > 0 || std::abs(diffa) > 0) {
if (std::abs(diffd) > 0 || std::abs(diffc) > 0 ||
std::abs(diffar) > 0 || std::abs(diffa) > 0) {
return false;
}
}
Expand All @@ -29,7 +34,8 @@ bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar) {
}
template <typename Tc, typename Ta, size_t TM, size_t TN, size_t TK, size_t M,
size_t N, size_t K, class kernel_name>
bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, queue q) {
bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref,
queue q) {
size_t NDRangeM = M / TM;
size_t NDRangeN = N / TN;

Expand Down Expand Up @@ -70,22 +76,32 @@ bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, queue q) {
joint_matrix_load(
sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN,
N, layout::row_major);
joint_matrix_apply(sg, sub_c, sub_d,
[](const Tc &x, Tc &y) { y = x * 2; });
joint_matrix_apply(sg, sub_c, sub_d, [](Tc &x, Tc &y) {
y = x * 2;
x = sycl::max(static_cast<Tc>(0), x);
});
joint_matrix_store(
sg, sub_d, pD + (sg_startx * TM) * N + sg_starty / sg_size * TN,
N, layout::row_major);
joint_matrix_store(
sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN,
N, layout::row_major);
joint_matrix_load(
sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK,
K);
joint_matrix_apply(sg, sub_a, sub_ar,
[](const Ta &x, Ta &y) { y = x + 42; });
joint_matrix_apply(sg, sub_a, sub_ar, [](Ta &x, Ta &y) {
y = x + 42;
x += 5;
});
ext::intel::experimental::matrix::joint_matrix_store(
sg, sub_ar,
pAr + (sg_startx * TM) * K + sg_starty / sg_size * TK, K);
ext::intel::experimental::matrix::joint_matrix_store(
sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK,
K);
}); // parallel for
}).wait();
return apply_verify<Tc, Ta, M, N>(C, D, A, Ar);
return apply_verify<Tc, Ta, M, N>(C, D, A, Ar, Cref, Aref);
}

template <typename Ta, typename Tc, size_t TM, size_t TN, size_t TK,
Expand All @@ -96,16 +112,20 @@ bool test() {
static constexpr size_t K = TK * 2;
queue q;

Tc *Cref = malloc_shared<Tc>(M * N, q);
Ta *Aref = malloc_shared<Ta>(M * K, q);
Tc *C = malloc_shared<Tc>(M * N, q);
Tc *D = malloc_shared<Tc>(M * N, q);
Ta *A = malloc_shared<Ta>(M * K, q);
Ta *Ar = malloc_shared<Ta>(M * K, q);

matrix_rand(M, N, (Tc *)C, (Tc)100);
matrix_rand(M, K, (Ta *)A, (Ta)100);
matrix_rand(M, N, (Tc *)Cref, (Tc)100);
matrix_rand(M, K, (Ta *)Aref, (Ta)100);
matrix_copy(M, N, Cref, C);
matrix_copy(M, K, Aref, A);

bool res = apply_two_matrices<Tc, Ta, TM, TN, TK, M, N, K, kernel_name>(
C, D, A, Ar, q);
C, D, A, Ar, Cref, Aref, q);

if constexpr (std::is_same_v<Ta, bfloat16>)
std::cout << "bfloat16 " << TM << "x" << TN << "x" << TK << ": "
Expand All @@ -117,6 +137,8 @@ bool test() {
free(D, q);
free(A, q);
free(Ar, q);
free(Cref, q);
free(Aref, q);

return res;
}
Expand Down
Loading