diff --git a/sycl/include/sycl/__spirv/spirv_ops.hpp b/sycl/include/sycl/__spirv/spirv_ops.hpp
index b001bc914b196..ad0c7a31d3519 100644
--- a/sycl/include/sycl/__spirv/spirv_ops.hpp
+++ b/sycl/include/sycl/__spirv/spirv_ops.hpp
@@ -84,15 +84,15 @@ extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixStoreCheckedINTEL(
     std::size_t Stride, size_t Height, size_t Width, size_t CoordX,
     size_t CoordY, __spv::MatrixLayout Layout = L, int MemOperand = 0);
-template
 extern __DPCPP_SYCL_EXTERNAL
-    __spv::__spirv_CooperativeMatrixKHR *
+    __spv::__spirv_CooperativeMatrixKHR *
     __spirv_CooperativeMatrixMulAddKHR(
         __spv::__spirv_CooperativeMatrixKHR *A,
         __spv::__spirv_CooperativeMatrixKHR *B,
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
index b8484a077c5fc..379ebaf1a1063 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
@@ -263,6 +263,25 @@ class wi_element::value,
+                            spv_scope_traits::value>(&M.spvm, idx);
+    union {
+      uint16_t intStorage;
+      sycl::ext::oneapi::bfloat16 floatValue;
+    };
+    floatValue = *ExtractP;
+    return __spirv_ConvertBF16ToFINTEL(intStorage);
+#else
+    throw exception(make_error_code(errc::runtime),
+                    "joint matrix is not supported on host.");
+#endif // __SYCL_DEVICE_ONLY__
+  }
+
   explicit operator bool() {
 #ifdef __SYCL_DEVICE_ONLY__
     sycl::ext::oneapi::bfloat16 *ExtractP =
@@ -295,6 +314,21 @@ class wi_element::value,
+                           spv_scope_traits::value>(&M.spvm, idx);
+    *InsertP = rhs;
+    return *this;
+#else
+    (void)rhs;
+    throw exception(make_error_code(errc::runtime),
+                    "joint matrix is not supported on host.");
+#endif // __SYCL_DEVICE_ONLY__
+  }
+
   wi_element &operator=(const wi_element &rhs) {
 #ifdef __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp
index 349acae157ae7..0e2e72b41e929 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp
@@ -85,26 +85,26 @@ extern "C" constexpr __spv::MatrixLayout joint_matrix_layout_to_spv(
   }
 }
 
-template
+template
 constexpr uint32_t CalculateMatrixOperand() {
+  uint32_t returnValue = 0x00;
   if constexpr (std::is_same::value &&
-                std::is_same::value &&
-                std::is_same::value)
-    return static_cast(
+                std::is_same::value)
+    returnValue += static_cast(
         __spv::MatrixOperands::MatrixAAndBBFloat16ComponentsINTEL);
-  if constexpr (std::is_signed::value && std::is_unsigned::value)
-    return static_cast(
+  if constexpr (std::is_same::value)
+    returnValue += static_cast(
+        __spv::MatrixOperands::MatrixCBFloat16ComponentsINTEL);
+  if constexpr (std::is_same::value)
+    returnValue += static_cast(
+        __spv::MatrixOperands::MatrixResultBFloat16ComponentsINTEL);
+  if constexpr (std::is_signed::value)
+    returnValue += static_cast(
         __spv::MatrixOperands::MatrixASignedComponentsKHR);
-  if constexpr (std::is_unsigned::value && std::is_signed::value)
-    return static_cast(
+  if constexpr (std::is_signed::value)
+    returnValue += static_cast(
         __spv::MatrixOperands::MatrixBSignedComponentsKHR);
-  if constexpr (std::is_signed::value && std::is_signed::value) {
-    return static_cast(
-        __spv::MatrixOperands::MatrixASignedComponentsKHR) +
-        static_cast(
-            __spv::MatrixOperands::MatrixBSignedComponentsKHR);
-  }
-  return 0;
+  return returnValue;
 }
 } // namespace detail
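Note on the CalculateMatrixOperand hunk above: the function used to return from one of several mutually exclusive branches; it now accumulates independent bit flags, which is what lets the new bfloat16 C/Result properties combine with the A/B signedness bits in a single operand word. A minimal standalone sketch of that logic follows (C++17; the enumerator values are assumptions for illustration, not the values from the SPIR-V KHR/INTEL cooperative-matrix specs):

// Standalone sketch. Enumerator values are ASSUMED for illustration.
#include <cstdint>
#include <type_traits>

enum class MatrixOperands : uint32_t {
  MatrixASignedComponentsKHR = 0x1,           // assumed value
  MatrixBSignedComponentsKHR = 0x2,           // assumed value
  MatrixAAndBBFloat16ComponentsINTEL = 0x40,  // assumed value
  MatrixCBFloat16ComponentsINTEL = 0x80,      // assumed value
  MatrixResultBFloat16ComponentsINTEL = 0x100 // assumed value
};

struct bf16_tag {}; // stand-in for sycl::ext::oneapi::bfloat16

// Each property owns a distinct bit, so "+" behaves like "|" and several
// properties can be encoded in one operand word.
template <typename Ta, typename Tb, typename Tc, typename Td>
constexpr uint32_t calculate_operand() {
  uint32_t v = 0;
  if constexpr (std::is_same_v<Ta, bf16_tag> && std::is_same_v<Tb, bf16_tag>)
    v += static_cast<uint32_t>(
        MatrixOperands::MatrixAAndBBFloat16ComponentsINTEL);
  if constexpr (std::is_same_v<Tc, bf16_tag>)
    v += static_cast<uint32_t>(MatrixOperands::MatrixCBFloat16ComponentsINTEL);
  if constexpr (std::is_same_v<Td, bf16_tag>)
    v += static_cast<uint32_t>(
        MatrixOperands::MatrixResultBFloat16ComponentsINTEL);
  if constexpr (std::is_signed_v<Ta>)
    v += static_cast<uint32_t>(MatrixOperands::MatrixASignedComponentsKHR);
  if constexpr (std::is_signed_v<Tb>)
    v += static_cast<uint32_t>(MatrixOperands::MatrixBSignedComponentsKHR);
  return v;
}

// bf16 A/B with float C/D -> only the A-and-B flag; int8 A/B -> both sign bits.
static_assert(calculate_operand<bf16_tag, bf16_tag, float, float>() == 0x40);
static_assert(calculate_operand<signed char, signed char, int, int>() == 0x3);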
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
index 3c1c8e3a84597..7aaac6c84bfe6 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -431,8 +431,7 @@ template
     (), sycl::detail::convertTypeToMatrixTypeString(), M, K, N)]]
 #endif // defined(__SYCL_DEVICE_ONLY__)
-inline __SYCL_ALWAYS_INLINE void
-joint_matrix_mad(
+inline __SYCL_ALWAYS_INLINE void joint_matrix_mad(
     Group,
     joint_matrix &D,
@@ -462,9 +461,9 @@ joint_matrix_mad(
   }
 #else
   constexpr uint32_t MatrixOperand =
-      sycl::detail::CalculateMatrixOperand();
-  D.spvm =
-      __spirv_CooperativeMatrixMulAddKHR(A.spvm, B.spvm, C.spvm, MatrixOperand);
+      sycl::detail::CalculateMatrixOperand();
+  D.spvm = __spirv_CooperativeMatrixMulAddKHR(
+      A.spvm, B.spvm, C.spvm, MatrixOperand);
 #endif // defined(__NVPTX__)
 #else
   std::ignore = A;
@@ -489,10 +488,23 @@ void joint_matrix_copy(
   using storage_element_type =
       typename oneapi::detail::jm_type_interpretation_helper_trait<
           T2>::storage_element_type;
+  using src_storage_element_type =
+      typename oneapi::detail::jm_type_interpretation_helper_trait<
+          T1>::storage_element_type;
+
   auto wi_data_c = sycl::ext::oneapi::detail::get_wi_data(sg, src);
   auto wi_data_dst = sycl::ext::oneapi::detail::get_wi_data(sg, dst);
   for (int i = 0; i < wi_data_c.length(); i++) {
-    wi_data_dst[i] = static_cast(wi_data_c[i]);
+    if constexpr (std::is_same_v) {
+      // Special case for SRC type sycl::half, since we can't cast directly
+      // from wi_element (typed half) to another type.
+      // The first cast is from wi_element to half (T1);
+      // the second cast is from half to the dst type (T2).
+      wi_data_dst[i] = static_cast(
+          static_cast(wi_data_c[i]));
+    } else {
+      wi_data_dst[i] = static_cast(wi_data_c[i]);
+    }
   }
 #endif // defined(__NVPTX__)
 #else
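Note on the joint_matrix_copy hunk above: it works around C++'s one-user-defined-conversion rule. The wi_element proxy converts to its own element type, so a single static_cast from a half-typed proxy to an unrelated 16-bit type would need two user-defined conversions in one sequence, which is ill-formed. A self-contained toy model (Half, BF16, and wi_element_proxy are stand-ins, not the real SYCL types; the float hop mirrors bfloat16 constructing from float):

#include <cstdio>

struct Half {
  float v;
  explicit operator float() const { return v; }
};

struct BF16 {
  float v;
  explicit BF16(float f) : v(f) {}
};

// Mimics the wi_element accessor returned by get_wi_data()[i]: the only
// conversion it offers is to its element type.
struct wi_element_proxy {
  Half h;
  operator Half() const { return h; }
};

int main() {
  wi_element_proxy src{Half{1.5f}};
  // BF16 bad = static_cast<BF16>(src); // ill-formed: needs two user-defined
  //                                    // conversions (proxy->Half->BF16)
  BF16 dst = static_cast<BF16>(static_cast<float>(static_cast<Half>(src)));
  std::printf("%f\n", dst.v);
  return 0;
}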
diff --git a/sycl/test-e2e/Matrix/Inputs/common.hpp b/sycl/test-e2e/Matrix/Inputs/common.hpp
index 73def8ead8bba..f87cbfb992505 100644
--- a/sycl/test-e2e/Matrix/Inputs/common.hpp
+++ b/sycl/test-e2e/Matrix/Inputs/common.hpp
@@ -67,7 +67,7 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
     for (unsigned int n = 0; n < N; n++) {
       int c_ind = transpose_c ? (n * M + m) : m * N + n;
       Tc acc = *(C + c_ind);
-
+      float tmp = 0.f;
       for (unsigned int k = 0; k < K; k++) {
         int a_ind = colmajor_a ? (k * M + m) : m * K + k;
         int b_ind = colmajor_b ? (n * K + k) : k * N + n;
@@ -80,6 +80,9 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
           acc += make_fp32(va[i]) * make_fp32(vb[i]);
         else if constexpr (std::is_same_v)
           acc += (float)va[i] * (float)vb[i];
+        else if constexpr (std::is_same_v &&
+                           std::is_same_v)
+          tmp += (float)va[i] * (float)vb[i];
         else if constexpr (std::is_same_v &&
                                std::is_same_v ||
                            std::is_integral_v && std::is_integral_v ||
@@ -92,6 +95,9 @@ void matrix_multiply_ref(Ta *A, Tb *B, Tc *C, int M, int N, int K,
           assert(false && "Unsupported type in matrix_multiply_ref.");
       }
     }
+    if constexpr (std::is_same_v &&
+                  std::is_same_v)
+      acc += (bfloat16)tmp;
 
     if constexpr (!std::is_same_v) {
       lambda(acc);
@@ -182,10 +188,11 @@ template
 bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) {
   for (int i = 0; i < rows; i++) {
     for (int j = 0; j < cols; j++) {
-      if constexpr (!exact && (std::is_same_v ||
-                               std::is_same_v ||
-                               (std::is_same_v &&
-                                std::is_same_v))) {
+      if constexpr (!exact &&
+                    (std::is_same_v ||
+                     std::is_same_v || std::is_same_v ||
+                     (std::is_same_v &&
+                      std::is_same_v))) {
         float diff = std::fabs(src[i * cols + j] - (T1)ref[i * cols + j]);
         if (diff > FLOAT_EPSILON || std::isnan(src[i * cols + j])) {
           std::cerr << "Incorrect result in matrix. "
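Note on the common.hpp hunks above: the reference path now mirrors what the hardware MAD does for a bfloat16 accumulator — products are summed in a float "tmp" and converted to bfloat16 once at the end, instead of rounding to bfloat16 on every addition. A small sketch of why that matters (round_to_bf16 truncates to the top 16 bits; real bfloat16 uses round-to-nearest-even, but the drift is the same in kind):

#include <cstdint>
#include <cstdio>
#include <cstring>

static float round_to_bf16(float x) {
  uint32_t u;
  std::memcpy(&u, &x, 4);
  u &= 0xFFFF0000u; // bfloat16 keeps only the top 16 bits of a float
  std::memcpy(&x, &u, 4);
  return x;
}

int main() {
  float per_step = 1.0f; // rounds to bf16 after every add (the naive way)
  float tmp = 0.0f;      // full-precision float accumulator (the diff's way)
  for (int k = 0; k < 64; ++k) {
    float prod = 0.01f * 3.0f; // stand-in for one bf16*bf16 product
    per_step = round_to_bf16(per_step + prod);
    tmp += prod;
  }
  float once = round_to_bf16(1.0f + round_to_bf16(tmp));
  std::printf("per-step rounding: %f, single conversion: %f\n", per_step, once);
  return 0;
}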
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_16bit_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_16bit_impl.hpp
new file mode 100644
index 0000000000000..fdfffd5aa06b9
--- /dev/null
+++ b/sycl/test-e2e/Matrix/Inputs/joint_matrix_16bit_impl.hpp
@@ -0,0 +1,138 @@
+//===---joint_matrix_16bit_impl.hpp - DPC++ joint_matrix----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+template
+class imatrix;
+
+template
+void matrix_multiply(big_matrix &D, big_matrix &C,
+                     big_matrix &A,
+                     big_matrix &B) {
+  size_t NDRangeM = M / TM;
+  size_t NDRangeN = N / TN;
+  buffer bufA(A.get_data(), range<2>(M, K));
+  buffer bufB(B.get_data(), range<2>(K, N));
+  buffer bufC((TAcc *)C.get_data(), range<2>(M, N));
+  buffer bufD((TResult *)D.get_data(), range<2>(M, N));
+  queue q;
+  size_t sg_size =
+      get_sg_size>(q);
+
+  q.submit([&](handler &cgh) {
+     accessor accA{bufA, cgh};
+     accessor accB{bufB, cgh};
+     accessor accC{bufC, cgh};
+     accessor accD{bufD, cgh};
+
+     cgh.parallel_for>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[sycl::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           // The submatrix API has to be accessed by all the work-items in a
+           // subgroup; these functions will be called once by the subgroup,
+           // with no code divergence between the work-items.
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix
+               sub_a;
+           joint_matrix sub_b;
+           joint_matrix sub_c;
+           joint_matrix sub_d;
+
+           joint_matrix_load(
+               sg, sub_c,
+               accC.template get_multi_ptr() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+
+           for (int k = 0; k < K / TK; k += 1) {
+             joint_matrix_load(
+                 sg, sub_a,
+                 accA.template get_multi_ptr() +
+                     (sg_startx * TM) * K + k * TK,
+                 K);
+             joint_matrix_load(
+                 sg, sub_b,
+                 accB.template get_multi_ptr() +
+                     (k * TK / VF) * (N * VF) + sg_starty / sg_size * TN * VF,
+                 N * VF);
+
+             joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c);
+             joint_matrix_copy(sg, sub_d, sub_c);
+           }
+
+           joint_matrix_store(
+               sg, sub_d,
+               accD.template get_multi_ptr() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+         }); // parallel for
+   }).wait();
+}
+
+template
+void test() {
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
+
+  static constexpr size_t MATRIX_M = TM * 2;
+  static constexpr size_t MATRIX_N = TN * 2;
+  static constexpr size_t MATRIX_K = TK * 2;
+  Tab A[MATRIX_M][MATRIX_K];
+  Tab B[MATRIX_K / VF][MATRIX_N * VF];
+  TAcc C[MATRIX_M][MATRIX_N];
+  TResult D[MATRIX_M][MATRIX_N];
+  TResult DRef[MATRIX_M][MATRIX_N];
+
+  matrix_rand(MATRIX_M, MATRIX_K, (Tab *)A, Tab(1));
+  matrix_rand(MATRIX_K / VF, MATRIX_N * VF, (Tab *)B, Tab(1));
+
+  matrix_fill(MATRIX_M, MATRIX_N, (TAcc *)C, TAcc(1));
+  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1));
+  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)DRef, TResult(1));
+
+  big_matrix MC((TAcc *)&C);
+  big_matrix MD((TResult *)&D);
+  big_matrix MA((Tab *)&A);
+  big_matrix MB((Tab *)&B);
+
+  matrix_multiply(MD, MC, MA, MB);
+  matrix_multiply_ref(
+      (Tab *)A, (Tab *)B, (TResult *)DRef, MATRIX_M, MATRIX_N, MATRIX_K / VF);
+  assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)D, (TResult *)DRef));
+}
+
+template
+void test_combo() {
+  test();
+  test();
+  test();
+  test();
+}
+
+template
+void test_all() {
+  test_combo();
+  test_combo();
+  test_combo();
+  test_combo();
+  test_combo();
+  test_combo();
+}
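Note on the B-matrix load in the new header above: it assumes VNNI ("packed") layout, where a K x N matrix is stored as K/VF rows of N*VF elements with VF consecutive k-values of a column adjacent in memory — hence the (k * TK / VF) * (N * VF) base offset and the N * VF stride. A standalone sketch of the packing rule:

// VNNI-pack a row-major K x N matrix with factor VF and check that element
// (k, n) lands at row k/VF, column n*VF + k%VF of the (K/VF) x (N*VF) buffer.
#include <cassert>
#include <vector>

int main() {
  constexpr int K = 8, N = 4, VF = 2;
  std::vector<int> B(K * N), packed((K / VF) * (N * VF));
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n)
      B[k * N + n] = 100 * k + n;
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n)
      packed[(k / VF) * (N * VF) + n * VF + k % VF] = B[k * N + n];
  // Every (k, n) can be recovered from the packed layout.
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n)
      assert(packed[(k / VF) * (N * VF) + n * VF + k % VF] == B[k * N + n]);
  return 0;
}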
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_impl.hpp
deleted file mode 100644
index 00e804cef2fb5..0000000000000
--- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_impl.hpp
+++ /dev/null
@@ -1,148 +0,0 @@
-//===---joint_matrix_bfloat16_impl.hpp - DPC++ joint_matrix----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-template class imatrix;
-
-template
-void matrix_multiply(big_matrix &C, big_matrix &A,
-                     big_matrix &B) {
-  size_t NDRangeM = M / TM;
-  size_t NDRangeN = N / TN;
-  buffer bufA(A.get_data(), range<2>(M, K));
-  buffer bufB(B.get_data(), range<2>(K, N));
-  buffer bufC((T1 *)C.get_data(), range<2>(M, N));
-
-  queue q;
-  size_t sg_size = get_sg_size>(q);
-  q.submit([&](handler &cgh) {
-     accessor accA{bufA, cgh};
-     accessor accB{bufB, cgh};
-     accessor accC{bufC, cgh};
-
-     cgh.parallel_for>(
-         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-             [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup no
-           // code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix sub_a;
-           // For B, we assume B has been already VNNIed.
-           joint_matrix
-               sub_b;
-           joint_matrix sub_c;
-
-           joint_matrix_load(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-           for (int k = 0; k < K / TK; k += 1) { //
-             joint_matrix_load(
-                 sg, sub_a,
-                 accA.template get_multi_ptr() +
-                     (sg_startx * TM) * K + k * TK,
-                 K);
-             joint_matrix_load(
-                 sg, sub_b,
-                 accB.template get_multi_ptr() +
-                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
-                 N * 2);
-             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-           }
-           joint_matrix_store(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-         }); // parallel for
-   }).wait();
-}
-
-template
-void test() {
-  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
-            << " [TM x TN x TK]" << std::endl;
-
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  static constexpr size_t MATRIX_K = TK * 2;
-  T A[MATRIX_M][MATRIX_K];
-  T B[MATRIX_K / 2][MATRIX_N * 2];
-  TResult C[MATRIX_M][MATRIX_N];
-  TResult D[MATRIX_M][MATRIX_N];
-
-  matrix_fill(MATRIX_M, MATRIX_K, (T *)A,
-              [](int i, int j) { return T(1) * (i + j); });
-  matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (T *)B,
-              [](int i, int j) { return T(2) * i + T(3) * j; });
-  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1));
-  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1));
-
-  big_matrix MC((TResult *)&C);
-  big_matrix MD((TResult *)&D);
-  big_matrix MA((T *)&A);
-  big_matrix MB((T *)&B);
-  matrix_multiply(MC, MA,
-                  MB);
-  matrix_multiply_ref((T *)A, (T *)B, (TResult *)D, MATRIX_M,
-                      MATRIX_N, MATRIX_K / 2);
-
-  assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D));
-}
-int main() {
-  queue q;
-  std::vector combinations =
-      q.get_device()
-          .get_info();
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      test();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test();
-      // test();
-
-      // This combination is not currently supported for sub group size = 32 in
-      // IGC
-#if (!defined(SG_SZ) || SG_SZ != 32)
-      test();
-      // test();
-      test();
-      // test();
-      test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-#endif
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test();
-      break;
-    }
-  }
-  return 0;
-}
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_packedB_impl.hpp
deleted file mode 100644
index 85d33f2c83173..0000000000000
--- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_packedB_impl.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-//=----- joint_matrix_bfloat16_packedB_impl.hpp - DPC++ joint_matrix -------=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//=-------------------------------------------------------------------------=//
-
-template
-void matrix_multiply(big_matrix &C, big_matrix &A,
-                     big_matrix &B) {
-  size_t NDRangeM = M / TM;
-  size_t NDRangeN = N / TN;
-  buffer bufA(A.get_data(), range<2>(M, K));
-  buffer bufB(B.get_data(), range<2>(K, N));
-  buffer bufC((float *)C.get_data(), range<2>(M, N));
-
-  queue q;
-  size_t sg_size = get_sg_size(q);
-  q.submit([&](handler &cgh) {
-     auto accC = bufC.get_access(cgh);
-     auto accA = bufA.get_access(cgh);
-     auto accB = bufB.get_access(cgh);
-
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-             [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup no
-           // code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix
-               sub_a;
-           // For B, we assume B has been already VNNIed.
-           joint_matrix
-               sub_b;
-           joint_matrix sub_c;
-
-           joint_matrix_load(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-           for (int k = 0; k < K / TK; k += 1) { //
-             joint_matrix_load(
-                 sg, sub_a,
-                 accA.template get_multi_ptr() +
-                     (sg_startx * TM) * K + k * TK,
-                 K);
-             // Assuming B data is already in VNNI format.
-             joint_matrix_load(
-                 sg, sub_b,
-                 accB.template get_multi_ptr() +
-                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
-                 N * 2);
-             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-           }
-           joint_matrix_store(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-         }); // parallel for
-   }).wait();
-}
-
-template int test() {
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  static constexpr size_t MATRIX_K = TK * 2;
-  bfloat16 A[MATRIX_M][MATRIX_K];
-  bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
-  float C[MATRIX_M][MATRIX_N];
-  float D[MATRIX_M][MATRIX_N];
-
-  matrix_fill(MATRIX_M, MATRIX_K, (bfloat16 *)A,
-              [](int i, int j) { return 1.0f * (i + j); });
-  matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (bfloat16 *)B,
-              [](int i, int j) { return 2.0f * i + 3.0f * j; });
-  matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f);
-  matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f);
-
-  big_matrix MC((float *)&C);
-  big_matrix MD((float *)&D);
-  big_matrix MA((bfloat16 *)&A);
-  big_matrix MB((bfloat16 *)&B);
-  matrix_multiply(MC, MA, MB);
-  matrix_multiply_ref(
-      (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N,
-      MATRIX_K / 2);
-
-  bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D);
-  std::cout << TM << "x" << TN << "x" << TK << " ";
-  std::cout << (res ? "passed" : "failed") << std::endl;
-  return !res;
-}
-
-int main() {
-  queue q;
-  std::vector combinations =
-      q.get_device()
-          .get_info();
-
-  int ret = 0;
-  for (auto &combination : combinations) {
-    if (combination.nsize == 0) { // Intel AMX
-      ret += test<16, 16, 16, class amx16x16x16>();
-      break;
-    }
-
-    if (combination.nsize == 16) { // architecture::intel_gpu_pvc
-      ret += test<16, 16, 16, class pvc16x16x16>();
-      ret += test<32, 64, 16, class pvc32x64x16>();
-      ret += test<1, 64, 16, class pvc1x64x16>();
-      break;
-    }
-  }
-
-  return ret;
-}
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_half_impl.hpp
deleted file mode 100644
index e51e7c30fa810..0000000000000
--- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_half_impl.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-//===---joint_matrix_half_impl.hpp - DPC++ joint_matrix--------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-template class mult;
-
-template
-void matrix_multiply(big_matrix &C, big_matrix &A,
-                     big_matrix &B) {
-  size_t NDRangeM = M / TM;
-  size_t NDRangeN = N / TN;
-  buffer bufA(A.get_data(), range<2>(M, K));
-  buffer bufB(B.get_data(), range<2>(K, N));
-  buffer bufC(C.get_data(), range<2>(M, N));
-
-  queue q;
-  size_t sg_size = get_sg_size>(q);
-  q.submit([&](handler &cgh) {
-     accessor accA{bufA, cgh};
-     accessor accB{bufB, cgh};
-     accessor accC{bufC, cgh};
-
-     cgh.parallel_for>(
-         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}),
-         [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
-             [[sycl::reqd_sub_group_size(SG_SZ)]]
-#endif
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup
-           // no code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix sub_a;
-           // For B, we assume B has been already VNNIed.
-           joint_matrix
-               sub_b;
-           joint_matrix sub_c;
-
-           joint_matrix_load(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-           for (int k = 0; k < K / TK; k += 1) {
-             joint_matrix_load(
-                 sg, sub_a,
-                 accA.template get_multi_ptr() +
-                     (sg_startx * TM) * K + k * TK,
-                 K);
-             joint_matrix_load(
-                 sg, sub_b,
-                 accB.template get_multi_ptr() +
-                     (k * TK / VNNI) * (N * VNNI) +
-                     sg_starty / sg_size * TN * VNNI,
-                 N * VNNI);
-             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-           }
-           joint_matrix_store(
-               sg, sub_c,
-               accC.template get_multi_ptr() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
-         }); // parallel for
-   }).wait();
-}
-
-template
-void test() {
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  static constexpr size_t MATRIX_K = TK * 2;
-  T A[MATRIX_M][MATRIX_K];
-  T B[MATRIX_K / VNNI][MATRIX_N * VNNI];
-  TResult C[MATRIX_M][MATRIX_N];
-  TResult D[MATRIX_M][MATRIX_N];
-
-  matrix_fill(MATRIX_M, MATRIX_K, (T *)A,
-              [](int i, int j) { return i + 2 * j; });
-  matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B,
-              [](int i, int j) { return i + j; });
-  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1));
-  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1));
-
-  big_matrix MC((TResult *)&C);
-  big_matrix MD((TResult *)&D);
-  big_matrix MA((T *)&A);
-  big_matrix MB((T *)&B);
-  matrix_multiply(
-      MC, MA, MB);
-  matrix_multiply_ref((T *)A, (T *)B, (TResult *)D,
-                      MATRIX_M, MATRIX_N, MATRIX_K / VNNI);
-
-  assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D));
-}
-
-int main() {
-  queue q;
-  std::vector combinations =
-      q.get_device()
-          .get_info();
-
-  for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].atype != matrix_type::fp16)
-      continue;
-
-    if (combinations[i].nsize == 0) { // Intel AMX
-      test();
-      break;
-    }
-
-    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test();
-      // test();
-
-      // This combination is not currently supported for sub group size = 32 in
-      // IGC
-#if (!defined(SG_SZ) || SG_SZ != 32)
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-      // test();
-#endif
-      break;
-    }
-
-    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test();
-      break;
-    }
-  }
-  return 0;
-}
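Note on the test changes below: all of the rewritten mains select tile shapes at run time from the device's reported capabilities (the nsize == 0/8/16 dispatch) instead of hard-coding them. A sketch of the underlying query, with the extension's names written out as I understand the sycl_ext_oneapi_matrix spec — treat exact namespaces and members as assumptions if your toolchain differs:

#include <iostream>
#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;
using syclex::matrix::combination;

int main() {
  sycl::queue q;
  std::vector<combination> combinations =
      q.get_device().get_info<syclex::info::device::matrix_combinations>();
  for (const combination &c : combinations)
    std::cout << "nsize: " << c.nsize << ", max MxNxK: " << c.max_msize
              << " x " << c.max_nsize << " x " << c.max_ksize << "\n";
  return 0;
}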
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
index 5eab1046e6fd8..fb533762d91e4 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
@@ -5,19 +5,58 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+
 // UNSUPPORTED: gpu-intel-dg2
+// UNSUPPORTED-INTENDED: SG size = 32 is not supported for SYCL Joint Matrix on
+// DG2
 
 // REQUIRES: target-spir
-
 // REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
 
 #include "common.hpp"
 
 #define SG_SZ 32
 
-#include "joint_matrix_bfloat16_impl.hpp"
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  queue q;
+  std::vector combinations =
+      q.get_device()
+          .get_info();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test();
+      test();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_accumulator.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_accumulator.cpp
new file mode 100644
index 0000000000000..74502df173cd2
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_accumulator.cpp
@@ -0,0 +1,39 @@
+//==- SG32/joint_matrix_bfloat16_accumulator.cpp - DPC++ joint_matrix -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: gpu-intel-dg2
+// UNSUPPORTED-INTENDED: SG size = 32 is not supported for SYCL Joint Matrix on
+// DG2
+// UNSUPPORTED: cpu
+// UNSUPPORTED-INTENDED: Different C and D types are not supported on AMX
+
+// REQUIRES: target-spir
+// REQUIRES: aspect-ext_intel_matrix
+// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10112, GSD-4181
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
+
+#include "common.hpp"
+
+#define SG_SZ 32
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  std::cout << "B row major:\n";
+  test_all();
+  std::cout << "B packed:\n";
+  test_all();
+}
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
deleted file mode 100644
index c80b477599059..0000000000000
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//==----- joint_matrix_bfloat16_packedB.cpp - DPC++ joint_matrix----------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: target-spir
-
-// REQUIRES: aspect-ext_intel_matrix
-// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
-
-// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
-// UNSUPPORTED: gpu-intel-dg2
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: gpu
-// XFAIL-TRACKER: GSD-4181
-
-#include "common.hpp"
-
-#define SG_SZ 32
-#include "joint_matrix_bfloat16_packedB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
index 8f672fcb82978..df1eecdd66c88 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
@@ -1,12 +1,14 @@
-//==-------- joint_matrix_half.cpp - DPC++ joint_matrix------------ ----==//
+//==------ SG32/joint_matrix_half.cpp - DPC++ joint_matrix--------- ----==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+
 // UNSUPPORTED: gpu-intel-dg2
+// UNSUPPORTED-INTENDED: SG size = 32 is not supported for SYCL Joint Matrix on
+// DG2
 
 // REQUIRES: target-spir
 
@@ -21,4 +23,39 @@
 
 #define SG_SZ 32
 
-#include "joint_matrix_half_impl.hpp"
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  queue q;
+  std::vector combinations =
+      q.get_device()
+          .get_info();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test();
+      test();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_half_accumulator.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_half_accumulator.cpp
new file mode 100644
index 0000000000000..28faa1f9485e3
--- /dev/null
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_half_accumulator.cpp
@@ -0,0 +1,39 @@
+//==-------SG32/joint_matrix_half_accumulator.cpp - DPC++ joint_matrix ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: gpu-intel-dg2
+// UNSUPPORTED-INTENDED: SG size = 32 is not supported for SYCL Joint Matrix on
+// DG2
+// UNSUPPORTED: cpu
+// UNSUPPORTED-INTENDED: Different C and D types are not supported on AMX
+
+// REQUIRES: target-spir
+// REQUIRES: aspect-ext_intel_matrix
+// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10112, GSD-4181
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
+
+#include "common.hpp"
+
+#define SG_SZ 32
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  std::cout << "B row major:\n";
+  test_all();
+  std::cout << "B packed:\n";
+  test_all();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
index d2acd9c81a715..61afad345c511 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
@@ -11,6 +11,45 @@
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
 
 #include "common.hpp"
-#include "joint_matrix_bfloat16_impl.hpp"
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  queue q;
+  std::vector combinations =
+      q.get_device()
+          .get_info();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test();
+      test();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_accumulator.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_accumulator.cpp
new file mode 100644
index 0000000000000..7c82fd71cb6a1
--- /dev/null
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_accumulator.cpp
@@ -0,0 +1,35 @@
+//==--- joint_matrix_bfloat16_accumulator.cpp - DPC++ joint_matrix-- ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: cpu
+// UNSUPPORTED-INTENDED: Different C and D types are not supported on AMX
+
+// REQUIRES: target-spir
+
+// REQUIRES: aspect-ext_intel_matrix
+// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
+
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10112
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=1 %{run} %t.out %}
+// RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
+
+#include "common.hpp"
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  std::cout << "B row major:\n";
+  test_all();
+  std::cout << "B packed:\n";
+  test_all();
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
index 8bcc38d271ec0..ab97297cbda1a 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
@@ -14,4 +14,39 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-#include "joint_matrix_half_impl.hpp"
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  queue q;
+  std::vector combinations =
+      q.get_device()
+          .get_info();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test();
+      test();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test();
+      test();
+      break;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/joint_matrix_half_accumulator.cpp
similarity index 58%
rename from sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
rename to sycl/test-e2e/Matrix/joint_matrix_half_accumulator.cpp
index 3c82f2fc78753..ff1bd30375a26 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half_accumulator.cpp
@@ -1,15 +1,23 @@
-//==----- joint_matrix_bfloat16_packedB.cpp - DPC++ joint_matrix----------==//
+//==------ joint_matrix_half_accumulator.cpp - DPC++ joint_matrix----- ----==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: cpu
+// UNSUPPORTED-INTENDED: Different C and D types are not supported on AMX
+
 // REQUIRES: target-spir
+// REQUIRES: aspect-fp16
 // REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27868, win: 101.5181
 
+// XFAIL: gpu
+// XFAIL-TRACKER: GSD-10112
+
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=2 %{run} %t.out %}
@@ -17,4 +25,12 @@
 // RUN: %if gpu %{ env IGC_JointMatrixLoadStoreOpt=0 %{run} %t.out %}
 
 #include "common.hpp"
-#include "joint_matrix_bfloat16_packedB_impl.hpp"
+
+#include "joint_matrix_16bit_impl.hpp"
+
+int main() {
+  std::cout << "B row major:\n";
+  test_all();
+  std::cout << "B packed:\n";
+  test_all();
+}
diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
index 5f96ad7f8438b..f93e9c2c2f970 100644
--- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
+++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
@@ -54,7 +54,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 232
+// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 229
 //
 // List of improperly UNSUPPORTED tests.
 // Remove the CHECK once the test has been properly UNSUPPORTED.
@@ -188,11 +188,8 @@
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_array.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_bfloat16_packedB.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_down_convert.cpp
-// CHECK-NEXT: Matrix/SG32/joint_matrix_half.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_int8_rowmajorA_rowmajorB.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_prefetch.cpp
 // CHECK-NEXT: Matrix/SG32/joint_matrix_rowmajorA_rowmajorB.cpp