intel · sarnex · Dec 9, 2024 · Nov 21, 2024 · Nov 21, 2024 · Nov 21, 2024
@@ -118,35 +118,39 @@ joint_matrix_apply(Group sg, joint_matrix<Group, T, Use, M, N, Layout> &jm,
   return;
 }
 
-template <typename Group, typename T, use Use, size_t M, size_t N,
+template <typename Group, typename T0, typename T1, use Use, size_t M, size_t N,
           layout Layout, typename F>
 inline __SYCL_ALWAYS_INLINE void
-joint_matrix_apply(Group sg, joint_matrix<Group, T, Use, M, N, Layout> &jmsrc,
-                   joint_matrix<Group, T, Use, M, N, Layout> &jmdest,
+joint_matrix_apply(Group sg, joint_matrix<Group, T0, Use, M, N, Layout> &jm0,
+                   joint_matrix<Group, T1, Use, M, N, Layout> &jm1,
                    F &&lambda) {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__) || defined(__HIP_PLATFORM_AMD_MFMA__)
   std::ignore = sg;
-  for (int i = 0; i < jmsrc.matrix_impl.wi_marray.size(); i++) {
-    lambda(jmsrc.matrix_impl.wi_marray[i], jmdest.matrix_impl.wi_marray[i]);
+  for (int i = 0; i < jm0.matrix_impl.wi_marray.size(); i++) {
+    lambda(jm0.matrix_impl.wi_marray[i], jm1.matrix_impl.wi_marray[i]);
   }
 #else // NVPTX
-  using storage_element_type =
+  using storage_element_type0 =
       typename oneapi::detail::jm_type_interpretation_helper_trait<
-          T>::storage_element_type;
-  auto wi_data_c = sycl::ext::oneapi::detail::get_wi_data(sg, jmsrc);
-  auto wi_data_d = sycl::ext::oneapi::detail::get_wi_data(sg, jmdest);
-  for (int i = 0; i < wi_data_c.length(); i++) {
-    storage_element_type elementsrc = wi_data_c[i];
-    storage_element_type elementdest = wi_data_d[i];
-    lambda(elementsrc, elementdest);
-    wi_data_d[i] = elementdest;
+          T0>::storage_element_type;
+  using storage_element_type1 =
+      typename oneapi::detail::jm_type_interpretation_helper_trait<
+          T1>::storage_element_type;
+  auto wi_data_0 = sycl::ext::oneapi::detail::get_wi_data(sg, jm0);
+  auto wi_data_1 = sycl::ext::oneapi::detail::get_wi_data(sg, jm1);
+  for (int i = 0; i < wi_data_0.length(); i++) {
+    storage_element_type0 element0 = wi_data_0[i];
+    storage_element_type1 element1 = wi_data_1[i];
+    lambda(element0, element1);
+    wi_data_0[i] = element0;
+    wi_data_1[i] = element1;
   }
 #endif
 #else
   std::ignore = sg;
-  std::ignore = jmsrc;
-  std::ignore = jmdest;
+  std::ignore = jm0;
+  std::ignore = jm1;
   std::ignore = lambda;
   throw exception(make_error_code(errc::runtime),
                   "joint matrix is not supported on host.");

diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_activation.cpp
@@ -0,0 +1,24 @@
+//==---------- joint_matrix_activation.cpp  - DPC++ joint_matrix-----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix
+
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+
+// RUN: %{build} %fp-model-precise -o %t.out
+// RUN: %{run} %t.out
+
+// Currently, the outlining into an apply function triggers a bug in IGC
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10373
+
+#include "../common.hpp"
+
+#define SG_SZ 32
+
+#include "../joint_matrix_activation_impl.hpp"
@@ -36,6 +36,12 @@ template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
   big_matrix(T *data) : mat(data) {}
 };
 
+enum class Activation {
+  ReLU,
+  Sigmoid,
+  None,
+};
+
 float make_fp32(bfloat16 x) {
   unsigned int y = *((int *)&x);
   y = y << 16;
@@ -156,15 +162,26 @@ void matrix_copy(unsigned int rows, unsigned int cols, T *src, T *dst) {
   }
 }
 
-template <typename T1, typename T2, bool exact = false>
+template <Activation act = Activation::None, typename T1, typename T2,
+          bool exact = false>
 bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) {
   for (int i = 0; i < rows; i++) {
     for (int j = 0; j < cols; j++) {
       if constexpr (!exact && (std::is_same_v<T1, float> ||
                                std::is_same_v<T1, bfloat16> ||
                                (std::is_same_v<T1, double> &&
                                 std::is_same_v<T2, double>))) {
-        float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]);
+        float diff = 0;
+        if constexpr (act == Activation::None)
+          diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]);
+        else if constexpr (act == Activation::ReLU)
+          diff =
+              std::fabs(src[i * cols + j] -
+                        (T1)(sycl::max(static_cast<T2>(0), ref[i * cols + j])));
+        else if constexpr (act == Activation::Sigmoid)
+          diff = std::fabs(src[i * cols + j] -
+                           (T1)(1.0f / (1.0f + sycl::exp(-ref[i * cols + j]))));
+
         if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) {
           std::cout << "Incorrect result in matrix. "
                     << "i: " << i << ", j: " << j

diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation.cpp b/sycl/test-e2e/Matrix/joint_matrix_activation.cpp
@@ -0,0 +1,19 @@
+//==---------- joint_matrix_activation.cpp  - DPC++ joint_matrix------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: aspect-ext_intel_matrix
+// UNSUPPORTED: gpu-intel-dg2
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Currently, the outlining into an apply function triggers a bug in IGC
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10373
+
+#include "common.hpp"
+#include "joint_matrix_activation_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_activation_impl.hpp
@@ -0,0 +1,120 @@
+//==-------- joint_matrix_down_convert_impl.hpp  - DPC++ joint_matrix-------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <iostream>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+constexpr size_t TM = 8;
+// TN and TK must be the same for this test.
+constexpr size_t TN = 16;
+constexpr size_t TK = 16;
+
+template <Activation act, size_t TM, size_t TK, typename Group, typename Tsrc,
+          typename Tdest, use UseSrc, use UseDest, layout LayoutSrc,
+          layout LayoutDest>
+void applyActivation(
+    Group &sg, joint_matrix<Group, Tsrc, UseSrc, TM, TK, LayoutSrc> &sub_c,
+    joint_matrix<Group, Tdest, UseDest, TM, TN, LayoutDest> &sub_a) {
+  if constexpr (act == Activation::None) {
+    joint_matrix_copy(sg, sub_c, sub_a);
+  } else if constexpr (act == Activation::ReLU) {
+
+    joint_matrix_apply(
+        sg, sub_c, [=](float &x) { x = sycl::max(static_cast<float>(0), x); });
+    joint_matrix_copy(sg, sub_c, sub_a);
+
+  } else if constexpr (act == Activation::Sigmoid) {
+    joint_matrix_apply(sg, sub_c,
+                       [=](float &x) { x = 1.0f / (1.0f + sycl::exp(-x)); });
+    joint_matrix_copy(sg, sub_c, sub_a);
+  }
+  return;
+}
+
+template <Activation act> class copy;
+
+template <Activation act, typename T1, typename T2, size_t M, size_t N,
+          size_t K>
+void matrix_activation_copy(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A) {
+  size_t NDRangeM = M / TM;
+  size_t NDRangeN = N / TN;
+  buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
+  buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
+
+  queue q;
+  size_t sg_size = get_sg_size<copy<act>>(q);
+  q.submit([&](handler &cgh) {
+     auto accC = bufC.get_access<access::mode::read_write>(cgh);
+     auto accA = bufA.get_access<access::mode::write>(cgh);
+
+     cgh.parallel_for<copy<act>>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[sycl::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           // The submatrix API has to be accessed by all the workitems in a
+           // subgroup these functions will be called once by the subgroup no
+           // code divergence between the workitems
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major>
+               sub_a;
+           joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
+           joint_matrix_load(
+               sg, sub_c,
+               accC.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+           applyActivation<act>(sg, sub_c, sub_a);
+
+           ext::intel::experimental::matrix::joint_matrix_store(
+               sg, sub_a,
+               accA.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N);
+         }); // parallel for
+   }).wait();
+}
+
+int main() {
+  static constexpr size_t MATRIX_M = TM * 2;
+  static constexpr size_t MATRIX_N = TN * 2;
+  static constexpr size_t MATRIX_K = TK * 2;
+  bfloat16 A[MATRIX_M][MATRIX_K];
+  float C[MATRIX_M][MATRIX_N];
+
+  matrix_rand(MATRIX_M, MATRIX_N, *C, (float)5);
+
+  big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
+  big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
+
+  matrix_activation_copy<Activation::None>(MC, MA);
+  bool res0 = matrix_compare(MATRIX_M, MATRIX_N, (bfloat16 *)A, (float *)C);
+  bool res = matrix_compare<Activation::None>(MATRIX_M, MATRIX_N, (bfloat16 *)A,
+                                              (float *)C);
+  std::cout << (res ? "Copy passed" : "Copy failed") << std::endl;
+
+  matrix_activation_copy<Activation::ReLU>(MC, MA);
+  res &= matrix_compare<Activation::ReLU>(MATRIX_M, MATRIX_N, (bfloat16 *)A,
+                                          (float *)C);
+  std::cout << (res ? "ReLU passed" : "ReLU failed") << std::endl;
+
+  matrix_activation_copy<Activation::Sigmoid>(MC, MA);
+  res &= matrix_compare<Activation::Sigmoid>(MATRIX_M, MATRIX_N, (bfloat16 *)A,
+                                             (float *)C);
+  std::cout << (res ? "Sigmoid passed" : "Sigmoid failed") << std::endl;
+
+  return !res;
+}
@@ -8,19 +8,24 @@
 #include <sycl/usm.hpp>
 
 template <typename Tc, typename Ta, size_t M, size_t N>
-bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar) {
+bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref) {
   for (size_t i = 0; i < M; i++)
     for (size_t j = 0; j < N; j++) {
-      Tc diffc = D[i * N + j] - C[i * N + j] * 2;
-      Ta diffa = Ar[i * N + j] - (A[i * N + j] + 42);
+      Tc diffd = D[i * N + j] - Cref[i * N + j] * 2;
+      Tc diffc = C[i * N + j] - sycl::max(static_cast<Tc>(0), Cref[i * N + j]);
+      Ta diffar = Ar[i * N + j] - (Aref[i * N + j] + 42);
+      Ta diffa = A[i * N + j] - (Aref[i * N + j] + 5);
       if constexpr (std::is_same_v<Ta, bfloat16>) {
-        if (std::fabs(diffc) > FLOAT_EPSILON ||
+        if (std::fabs(diffd) > FLOAT_EPSILON ||
+            std::fabs(diffc) > FLOAT_EPSILON ||
+            std::fabs(diffar) > FLOAT_EPSILON ||
             std::fabs(diffa) > FLOAT_EPSILON || std::isnan(C[i * N + j]) ||
             std::isnan(A[i * N + j])) {
           return false;
         }
       } else {
-        if (std::abs(diffc) > 0 || std::abs(diffa) > 0) {
+        if (std::abs(diffd) > 0 || std::abs(diffc) > 0 ||
+            std::abs(diffar) > 0 || std::abs(diffa) > 0) {
           return false;
         }
       }
@@ -29,7 +34,8 @@ bool apply_verify(Tc *C, Tc *D, Ta *A, Ta *Ar) {
 }
 template <typename Tc, typename Ta, size_t TM, size_t TN, size_t TK, size_t M,
           size_t N, size_t K, class kernel_name>
-bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, queue q) {
+bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, Tc *Cref, Ta *Aref,
+                        queue q) {
   size_t NDRangeM = M / TM;
   size_t NDRangeN = N / TN;
 
@@ -70,22 +76,32 @@ bool apply_two_matrices(Tc *C, Tc *D, Ta *A, Ta *Ar, queue q) {
            joint_matrix_load(
                sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
-           joint_matrix_apply(sg, sub_c, sub_d,
-                              [](const Tc &x, Tc &y) { y = x * 2; });
+           joint_matrix_apply(sg, sub_c, sub_d, [](Tc &x, Tc &y) {
+             y = x * 2;
+             x = sycl::max(static_cast<Tc>(0), x);
+           });
            joint_matrix_store(
                sg, sub_d, pD + (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
+           joint_matrix_store(
+               sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
            joint_matrix_load(
                sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK,
                K);
-           joint_matrix_apply(sg, sub_a, sub_ar,
-                              [](const Ta &x, Ta &y) { y = x + 42; });
+           joint_matrix_apply(sg, sub_a, sub_ar, [](Ta &x, Ta &y) {
+             y = x + 42;
+             x += 5;
+           });
            ext::intel::experimental::matrix::joint_matrix_store(
                sg, sub_ar,
                pAr + (sg_startx * TM) * K + sg_starty / sg_size * TK, K);
+           ext::intel::experimental::matrix::joint_matrix_store(
+               sg, sub_a, pA + (sg_startx * TM) * K + sg_starty / sg_size * TK,
+               K);
          }); // parallel for
    }).wait();
-  return apply_verify<Tc, Ta, M, N>(C, D, A, Ar);
+  return apply_verify<Tc, Ta, M, N>(C, D, A, Ar, Cref, Aref);
 }
 
 template <typename Ta, typename Tc, size_t TM, size_t TN, size_t TK,
@@ -96,16 +112,20 @@ bool test() {
   static constexpr size_t K = TK * 2;
   queue q;
 
+  Tc *Cref = malloc_shared<Tc>(M * N, q);
+  Ta *Aref = malloc_shared<Ta>(M * K, q);
   Tc *C = malloc_shared<Tc>(M * N, q);
   Tc *D = malloc_shared<Tc>(M * N, q);
   Ta *A = malloc_shared<Ta>(M * K, q);
   Ta *Ar = malloc_shared<Ta>(M * K, q);
 
-  matrix_rand(M, N, (Tc *)C, (Tc)100);
-  matrix_rand(M, K, (Ta *)A, (Ta)100);
+  matrix_rand(M, N, (Tc *)Cref, (Tc)100);
+  matrix_rand(M, K, (Ta *)Aref, (Ta)100);
+  matrix_copy(M, N, Cref, C);
+  matrix_copy(M, K, Aref, A);
 
   bool res = apply_two_matrices<Tc, Ta, TM, TN, TK, M, N, K, kernel_name>(
-      C, D, A, Ar, q);
+      C, D, A, Ar, Cref, Aref, q);
 
   if constexpr (std::is_same_v<Ta, bfloat16>)
     std::cout << "bfloat16 " << TM << "x" << TN << "x" << TK << ": "
@@ -117,6 +137,8 @@ bool test() {
   free(D, q);
   free(A, q);
   free(Ar, q);
+  free(Cref, q);
+  free(Aref, q);
 
   return res;
 }