diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 481ac16ec4f26..0a61410b07d18 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -311,6 +311,27 @@
 extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixPrefetchINTEL(
     T *Ptr, uint32_t NumRows, uint32_t NumCols, unsigned int CacheLevel,
     __spv::MatrixLayout Layout, size_t Stride);
+template <typename T, typename Tp, std::size_t R, std::size_t C,
+          __spv::MatrixUse U,
+          __spv::MatrixLayout L = __spv::MatrixLayout::RowMajor,
+          __spv::Scope::Flag S = __spv::Scope::Flag::Subgroup>
+extern __DPCPP_SYCL_EXTERNAL
+    __spv::__spirv_JointMatrixINTEL<Tp, R, C, L, S, U> *
+    __spirv_CooperativeMatrixLoadOffsetINTEL(T *Ptr, int32_t RowIndex,
+                                             int32_t ColIndex,
+                                             __spv::MatrixLayout Layout = L,
+                                             std::size_t Stride = 0,
+                                             int MemOperand = 0);
+
+template <typename T, typename Tp, std::size_t R, std::size_t C,
+          __spv::MatrixUse U,
+          __spv::MatrixLayout L = __spv::MatrixLayout::RowMajor,
+          __spv::Scope::Flag S = __spv::Scope::Flag::Subgroup>
+extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixStoreOffsetINTEL(
+    T *Ptr, int32_t RowIndex, int32_t ColIndex,
+    __spv::__spirv_JointMatrixINTEL<Tp, R, C, L, S, U> *Object,
+    __spv::MatrixLayout Layout = L, std::size_t Stride = 0, int MemOperand = 0);
+
 #ifndef __SPIRV_BUILTIN_DECLARATIONS__
 #error \
     "SPIR-V built-ins are not available. Please set -fdeclare-spirv-builtins flag."
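Note (reviewer commentary, not part of the patch): unlike the existing builtins, the Offset variants identify the tile by the (row, column) of its first element plus a stride, and the implementation linearises that pair according to the layout. A plain-C++ sketch of the address arithmetic the offset form replaces; the tile position and leading dimension below are illustrative:

```cpp
#include <cassert>
#include <cstddef>

// Element offset of a tile whose top-left element sits at (Row, Col) in a
// matrix with leading dimension Stride, for the two layouts the builtins take.
constexpr std::size_t tile_origin(std::size_t Row, std::size_t Col,
                                  std::size_t Stride, bool RowMajor) {
  return RowMajor ? Row * Stride + Col : Col * Stride + Row;
}

int main() {
  // A tile starting at row 8, column 32 of a matrix with leading dimension 64
  // used to require the caller to pre-offset the pointer by 8 * 64 + 32.
  assert(tile_origin(8, 32, 64, /*RowMajor=*/true) == 8 * 64 + 32);
  // The same tile origin when the matrix is stored column-major.
  assert(tile_origin(8, 32, 64, /*RowMajor=*/false) == 32 * 64 + 8);
  return 0;
}
```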
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
index a3749a0137e78..1e5a573ec2c3f 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
@@ -1120,6 +1120,101 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store_checked(
 }
 
 // End out-of-bounds API
 
+template <
+    typename Group, typename T, typename Tp,
+    sycl::ext::oneapi::experimental::matrix::use Use, size_t NumRows,
+    size_t NumCols, sycl::ext::oneapi::experimental::matrix::layout Layout,
+    access::address_space Space, access::decorated IsDecorated,
+    std::enable_if_t = true>
+inline __SYCL_ALWAYS_INLINE void
+joint_matrix_store(Group,
+                   const sycl::ext::oneapi::experimental::matrix::joint_matrix<
+                       Group, Tp, Use, NumRows, NumCols, Layout> &Src,
+                   size_t RowIndex, size_t ColIndex,
+                   multi_ptr<T, Space, IsDecorated> BaseDst, size_t Stride) {
+#if defined(__SYCL_DEVICE_ONLY__)
+  static_assert(Space != access::address_space::private_space,
+                "Joint Matrix doesn't support store to private memory!");
+#if defined(__NVPTX__)
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  throw exception(
+      make_error_code(errc::runtime),
+      "This version of the matrix extension is only currently supported on "
+      "intel devices");
+#else
+  // intel's impl
+  using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
+  DecorT *Ptr = sycl::detail::getDecorated<DecorT>(BaseDst);
+  __spirv_CooperativeMatrixStoreOffsetINTEL<
+      DecorT, Tp, NumRows, NumCols,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_use_traits<
+          Use>::value,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
+          Layout>::value>(
+      Ptr, RowIndex, ColIndex, Src.spvm,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
+          Layout>::value,
+      Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <
+    typename Group, typename T, typename Tp,
+    sycl::ext::oneapi::experimental::matrix::use Use, size_t NumRows,
+    size_t NumCols, sycl::ext::oneapi::experimental::matrix::layout Layout,
+    typename PropertyListT,
+    std::enable_if_t = true>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
+    Group,
+    const sycl::ext::oneapi::experimental::matrix::joint_matrix<
+        Group, Tp, Use, NumRows, NumCols, Layout> &Src,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> BaseDst,
+    size_t RowIndex, size_t ColIndex, size_t Stride) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  throw exception(
+      make_error_code(errc::runtime),
+      "This version of the matrix extension is only currently supported on "
+      "intel devices");
+#else
+  // intel's impl
+  T *Ptr = BaseDst.get();
+  __spirv_CooperativeMatrixStoreOffsetINTEL<
+      T, Tp, NumRows, NumCols,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_use_traits<
+          Use>::value,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
+          Layout>::value>(
+      Ptr, RowIndex, ColIndex, Src.spvm,
+      sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
+          Layout>::value,
+      Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
 } // namespace intel::experimental::matrix
 } // namespace ext
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
index d3d57f24c56e6..ff49f4d2ad067 100644
--- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
+++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -377,6 +377,181 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
 #endif // defined(__SYCL_DEVICE_ONLY__)
 }
 
+// Begin offset load overloads
+template <
+    typename Group, typename S, typename T, size_t NumRows, size_t NumCols,
+    access::address_space Space, access::decorated IsDecorated,
+    std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value, bool> =
+        true>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
+    Group Sg,
+    joint_matrix<Group, S, use::accumulator, NumRows, NumCols,
+                 layout::dynamic> &Res,
+    multi_ptr<T, Space, IsDecorated> BaseSrc, size_t RowIndex, size_t ColIndex,
+    size_t Stride, sycl::ext::oneapi::experimental::matrix::layout Layout) {
+#if defined(__SYCL_DEVICE_ONLY__)
+  static_assert(Space != access::address_space::private_space,
+                "Joint Matrix doesn't support load from private memory!");
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  auto LoadStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::load_accumulator_cuda(
+      Res.matrix_impl, BaseSrc + LoadStride, Stride, Layout);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  auto LoadStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::load_accumulator_hip(
+      Res.matrix_impl, BaseSrc + LoadStride, Stride, Layout, Sg);
+#else
+  std::ignore = Sg;
+  using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
+  DecorT *Ptr = sycl::detail::getDecorated<DecorT>(BaseSrc);
+  Res.spvm = __spirv_CooperativeMatrixLoadOffsetINTEL<
+      DecorT, S, NumRows, NumCols,
+      spv_matrix_use_traits<use::accumulator>::value,
+      spv_matrix_layout_traits<layout::dynamic>::value>(
+      Ptr, RowIndex, ColIndex, sycl::detail::joint_matrix_layout_to_spv(Layout),
+      Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Res;
+  std::ignore = BaseSrc;
+  std::ignore = Stride;
+  std::ignore = Layout;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <
+    typename Group, typename S, typename T, use Use, size_t NumRows,
+    size_t NumCols, matrix::layout Layout, access::address_space Space,
+    access::decorated IsDecorated,
+    std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value ||
+                         (std::is_same<S, precision::tf32>::value &&
+                          std::is_same<std::remove_const_t<T>, float>::value),
+                     bool> = true>
+inline __SYCL_ALWAYS_INLINE void
+joint_matrix_load(Group Sg,
+                  joint_matrix<Group, S, Use, NumRows, NumCols, Layout> &Res,
+                  multi_ptr<T, Space, IsDecorated> BaseSrc, size_t RowIndex,
+                  size_t ColIndex, size_t Stride) {
+#if defined(__SYCL_DEVICE_ONLY__)
+  static_assert(Space != access::address_space::private_space,
+                "Joint Matrix doesn't support load from private memory!");
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  auto LoadStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::load_multiplicand_cuda(
+      Res.matrix_impl, BaseSrc + LoadStride, Stride);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  auto LoadStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::load_multiplicand_hip(
+      Res.matrix_impl, BaseSrc + LoadStride, Stride, Sg);
+#else
+  std::ignore = Sg;
+  using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
+  DecorT *Ptr = sycl::detail::getDecorated<DecorT>(BaseSrc);
+  Res.spvm = __spirv_CooperativeMatrixLoadOffsetINTEL<
+      DecorT, S, NumRows, NumCols, spv_matrix_use_traits<Use>::value,
+      spv_matrix_layout_traits<Layout>::value>(
+      Ptr, RowIndex, ColIndex, spv_matrix_layout_traits<Layout>::value, Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Res;
+  std::ignore = BaseSrc;
+  std::ignore = Stride;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <typename Group, typename S, typename T, size_t NumRows,
+          size_t NumCols, typename PropertyListT,
+          std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value,
+                           bool> = true>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
+    Group Sg,
+    joint_matrix<Group, S, use::accumulator, NumRows, NumCols,
+                 layout::dynamic> &Res,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> BaseSrc,
+    size_t RowIndex, size_t ColIndex, size_t Stride,
+    sycl::ext::oneapi::experimental::matrix::layout Layout) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_load on multi_ptr on Nvidia device.");
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_load on multi_ptr on AMD device.");
+#else
+  std::ignore = Sg;
+  T *Ptr = BaseSrc.get();
+  Res.spvm = __spirv_CooperativeMatrixLoadOffsetINTEL<
+      T, S, NumRows, NumCols, spv_matrix_use_traits<use::accumulator>::value,
+      spv_matrix_layout_traits<layout::dynamic>::value>(
+      Ptr, RowIndex, ColIndex, sycl::detail::joint_matrix_layout_to_spv(Layout),
+      Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Res;
+  std::ignore = BaseSrc;
+  std::ignore = Stride;
+  std::ignore = Layout;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <
+    typename Group, typename S, typename T, use Use, size_t NumRows,
+    size_t NumCols, matrix::layout Layout, typename PropertyListT,
+    std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value ||
+                         (std::is_same<S, precision::tf32>::value &&
+                          std::is_same<std::remove_const_t<T>, float>::value),
+                     bool> = true>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
+    Group Sg, joint_matrix<Group, S, Use, NumRows, NumCols, Layout> &Res,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> BaseSrc,
+    size_t RowIndex, size_t ColIndex, size_t Stride) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_load on multi_ptr on Nvidia device.");
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_load on multi_ptr on AMD device.");
+#else
+  std::ignore = Sg;
+  T *Ptr = BaseSrc.get();
+  Res.spvm = __spirv_CooperativeMatrixLoadOffsetINTEL<
+      T, S, NumRows, NumCols, spv_matrix_use_traits<Use>::value,
+      spv_matrix_layout_traits<Layout>::value>(
+      Ptr, RowIndex, ColIndex, spv_matrix_layout_traits<Layout>::value, Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Res;
+  std::ignore = BaseSrc;
+  std::ignore = Stride;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+// End offset load overloads
+
 template <typename Group, typename T, size_t NumRows, size_t NumCols,
           access::address_space Space, access::decorated IsDecorated>
 inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
@@ -472,6 +647,92 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
 #endif // defined(__SYCL_DEVICE_ONLY__)
 }
 
+// Begin offset store overloads
+template <typename Group, typename T, size_t NumRows, size_t NumCols,
+          access::address_space Space, access::decorated IsDecorated>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
+    Group Sg,
+    const joint_matrix<Group, T, use::accumulator, NumRows, NumCols,
+                       layout::dynamic>
+        &Src,
+    multi_ptr<T, Space, IsDecorated> BaseDst, size_t RowIndex, size_t ColIndex,
+    size_t Stride, sycl::ext::oneapi::experimental::matrix::layout Layout) {
+#if defined(__SYCL_DEVICE_ONLY__)
+  static_assert(Space != access::address_space::private_space,
+                "Joint Matrix doesn't support store to private memory!");
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  auto StoreStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                 : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::joint_matrix_store_cuda(
+      Src.matrix_impl, BaseDst + StoreStride, Stride, Layout);
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  auto StoreStride = Layout == layout::row_major ? RowIndex * Stride + ColIndex
+                                                 : RowIndex + ColIndex * Stride;
+  sycl::ext::oneapi::detail::joint_matrix_store_hip(
+      Src.matrix_impl, BaseDst + StoreStride, Stride, Layout, Sg);
+#else
+  std::ignore = Sg;
+  using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
+  DecorT *Ptr = sycl::detail::getDecorated<DecorT>(BaseDst);
+  __spirv_CooperativeMatrixStoreOffsetINTEL<
+      DecorT, T, NumRows, NumCols,
+      spv_matrix_use_traits<use::accumulator>::value,
+      spv_matrix_layout_traits<layout::dynamic>::value>(
+      Ptr, RowIndex, ColIndex, Src.spvm,
+      sycl::detail::joint_matrix_layout_to_spv(Layout), Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  std::ignore = Layout;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+
+template <typename Group, typename T, size_t NumRows, size_t NumCols,
+          typename PropertyListT>
+inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
+    Group Sg,
+    const joint_matrix<Group, T, use::accumulator, NumRows, NumCols,
+                       layout::dynamic>
+        &Src,
+    ext::oneapi::experimental::annotated_ptr<T, PropertyListT> BaseDst,
+    size_t RowIndex, size_t ColIndex, size_t Stride,
+    sycl::ext::oneapi::experimental::matrix::layout Layout) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__NVPTX__)
+  std::ignore = Sg;
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_store on multi_ptr on Nvidia device.");
+#elif defined(__HIP_PLATFORM_AMD_MFMA__)
+  throw exception(make_error_code(errc::runtime),
+                  "Use joint_matrix_store on multi_ptr on AMD device.");
+#else
+  std::ignore = Sg;
+  T *Ptr = BaseDst.get();
+  __spirv_CooperativeMatrixStoreOffsetINTEL<
+      T, T, NumRows, NumCols, spv_matrix_use_traits<use::accumulator>::value,
+      spv_matrix_layout_traits<layout::dynamic>::value>(
+      Ptr, RowIndex, ColIndex, Src.spvm,
+      sycl::detail::joint_matrix_layout_to_spv(Layout), Stride);
+#endif // defined(__NVPTX__)
+#else
+  std::ignore = Sg;
+  std::ignore = Src;
+  std::ignore = BaseDst;
+  std::ignore = Stride;
+  std::ignore = Layout;
+  throw exception(make_error_code(errc::runtime),
+                  "joint matrix is not supported on host.");
+#endif // defined(__SYCL_DEVICE_ONLY__)
+}
+// End offset store overloads
 
 template
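A condensed usage sketch of the new unified overloads, modelled on the bfloat16 e2e test further down in this patch. The tile sizes, the sub-group size of 16, the USM pointers and the VNNI-packed B operand are illustrative assumptions and not taken from the patch itself; only the joint_matrix_load/joint_matrix_store calls with (row, column) element offsets exercise the added API:

```cpp
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;
using bfloat16 = sycl::ext::oneapi::bfloat16;

constexpr size_t TM = 8, TN = 16, TK = 16;

// C (MxN, row-major) += A (MxK, row-major) * B (KxN, VNNI-packed as K/2 x 2N).
void gemm_tiles(queue &q, const bfloat16 *A, const bfloat16 *B, float *C,
                size_t M, size_t N, size_t K) {
  q.parallel_for(
       nd_range<2>({M / TM, (N / TN) * 16}, {1, 16}),
       [=](nd_item<2> it) [[sycl::reqd_sub_group_size(16)]] {
         sub_group sg = it.get_sub_group();
         size_t tile_row = it.get_group(0) * TM; // tile origin, in elements
         size_t tile_col = it.get_group(1) * TN;

         auto pA = address_space_cast<access::address_space::global_space,
                                      access::decorated::no>(A);
         auto pB = address_space_cast<access::address_space::global_space,
                                      access::decorated::no>(B);
         auto pC = address_space_cast<access::address_space::global_space,
                                      access::decorated::no>(C);

         joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major> a;
         joint_matrix<sub_group, bfloat16, use::b, TK, TN,
                      layout::ext_intel_packed>
             b;
         joint_matrix<sub_group, float, use::accumulator, TM, TN> c;

         // Row/column element offsets instead of pre-offset pointers.
         joint_matrix_load(sg, c, pC, tile_row, tile_col, N, layout::row_major);
         for (size_t k = 0; k < K; k += TK) {
           joint_matrix_load(sg, a, pA, tile_row, k, K);
           // B is VNNI-packed: halve the row index, double column and stride.
           joint_matrix_load(sg, b, pB, k / 2, tile_col * 2, N * 2);
           joint_matrix_mad(sg, c, a, b, c);
         }
         joint_matrix_store(sg, c, pC, tile_row, tile_col, N,
                            layout::row_major);
       })
      .wait();
}
```

Each of these calls has a pointer-arithmetic equivalent using the existing overloads with a base pointer pre-offset by RowIndex * Stride + ColIndex (for row-major data), which is what the #else branches in the tests below keep exercising.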
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
index d1fbd4cbda15b..638ad25c46839 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
@@ -10,5 +10,8 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
+// RUN: %{build} -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+
 #include "common.hpp"
 #include "joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
index fc1fc1963e047..2560dfd46e2c6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
@@ -48,13 +48,29 @@ void matrix_multiply(big_matrix &C, big_matrix &A,
                        layout::ext_intel_packed>
           sub_b;
       joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
-
+#ifdef OFFSET
+      joint_matrix_load(
+          sg, sub_c, accC.template get_multi_ptr<access::decorated::no>(),
+          sg_startx * TM, sg_starty / sg_size * TN, N, layout::row_major);
+#else
       joint_matrix_load(
           sg, sub_c,
           accC.template get_multi_ptr<access::decorated::no>() +
              (sg_startx * TM) * N + sg_starty / sg_size * TN,
          N, layout::row_major);
+#endif
      for (int k = 0; k < K / TK; k += 1) { //
+#ifdef OFFSET
+        joint_matrix_load(
+            sg, sub_a,
+            accA.template get_multi_ptr<access::decorated::no>(),
+            sg_startx * TM, k * TK, K);
+        joint_matrix_load(
+            sg, sub_b,
+            accB.template get_multi_ptr<access::decorated::no>(),
+            k * TK / 2, sg_starty / sg_size * TN * 2, N * 2);
+
+#else
        joint_matrix_load(
            sg, sub_a,
            accA.template get_multi_ptr<access::decorated::no>() +
@@ -65,13 +81,20 @@ void matrix_multiply(big_matrix &C, big_matrix &A,
            accB.template get_multi_ptr<access::decorated::no>() +
                (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
            N * 2);
+#endif
        joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
      }
+#ifdef OFFSET
+      joint_matrix_store(
+          sg, sub_c, accC.template get_multi_ptr<access::decorated::no>(),
+          sg_startx * TM, sg_starty / sg_size * TN, N, layout::row_major);
+#else
      joint_matrix_store(
          sg, sub_c,
          accC.template get_multi_ptr<access::decorated::no>() +
              (sg_startx * TM) * N + sg_starty / sg_size * TN,
          N, layout::row_major);
+#endif
    }); // parallel for
  }).wait();
 }
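The OFFSET branch above passes k * TK / 2, sg_starty / sg_size * TN * 2 and a stride of N * 2 for the B operand because the test keeps B in ext_intel_packed (VNNI) form. A small host-only check of that index mapping, assuming the usual two-row VNNI packing these tests use; the helper and sizes below are illustrative and not part of the test:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Pack a K x N row-major matrix into VNNI form: element (i, j) moves to
// packed row i / 2, packed column j * 2 + i % 2, with leading dimension N * 2.
std::vector<int> vnni_pack(const std::vector<int> &B, std::size_t K,
                           std::size_t N) {
  std::vector<int> P(K * N);
  for (std::size_t i = 0; i < K; ++i)
    for (std::size_t j = 0; j < N; ++j)
      P[(i / 2) * (N * 2) + j * 2 + i % 2] = B[i * N + j];
  return P;
}

int main() {
  constexpr std::size_t K = 8, N = 4, TK = 4;
  std::vector<int> B(K * N);
  for (std::size_t i = 0; i < K * N; ++i)
    B[i] = static_cast<int>(i);
  auto P = vnni_pack(B, K, N);

  // The k-th TK x N slice of logical B starts, in packed coordinates, at
  // row k * TK / 2 and column 0 with stride N * 2 -- the indices the
  // OFFSET branch hands to joint_matrix_load.
  for (std::size_t k = 0; k < K / TK; ++k)
    assert(P[(k * TK / 2) * (N * 2)] == B[(k * TK) * N]);
  return 0;
}
```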
diff --git a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
index 230a20a62e1c7..453a04eb99d09 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
@@ -167,28 +167,43 @@ void test(queue &q) {
               sub_c;
           joint_matrix<sub_group, T2, use::accumulator, M, N> sub_d;
           auto stride_C = layout_C == layout::row_major ? Big_N : Big_M;
+#ifdef OFFSET
+
+          joint_matrix_load(
+              sg, sub_c, accC.template get_multi_ptr<access::decorated::no>(),
+              m * M, n * N, stride_C, layout_C);
+#else
           auto load_stride_C = layout_C == layout::row_major
                                    ? (m * M) * Big_N + n * N
                                    : (m * M) + n * N * Big_M;
-
           joint_matrix_load(
               sg, sub_c,
               accC.template get_multi_ptr<access::decorated::no>() +
                   load_stride_C,
               stride_C, layout_C);
-
+#endif
           auto stride_A = layout_A == layout::row_major ? Big_K : Big_M;
           auto stride_B = layout_B == layout::row_major ? Big_N : Big_K;
 
           // k = row/col id of current submatrix of BIG A/B matrices
           for (int k = 0; k < Sub_Tiles_K; k++) {
+#ifdef OFFSET
+            joint_matrix_load(
+                sg, sub_a,
+                accA.template get_multi_ptr<access::decorated::no>(), m * M,
+                k * K, stride_A);
+
+            joint_matrix_load(
+                sg, sub_b,
+                accB.template get_multi_ptr<access::decorated::no>(), k * K,
+                n * N, stride_B);
+#else
             auto load_stride_A = layout_A == layout::row_major
                                      ? (k * K) + (m * M * Big_K)
                                      : (k * K * Big_M) + (m * M);
            auto load_stride_B = layout_B == layout::row_major
                                     ? (k * K * Big_N) + (n * N)
                                     : (k * K) + (n * N * Big_K);
-
            joint_matrix_load(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
@@ -200,7 +215,7 @@ void test(queue &q) {
                accB.template get_multi_ptr<access::decorated::no>() +
                    load_stride_B,
                stride_B);
-
+#endif
            // round values to correct precision if using tf32
            if constexpr (std::is_same<T1, precision::tf32>::value) {
              auto round_lambda = [](auto &x) { x = round_to_tf32(x); };
@@ -211,11 +226,17 @@ void test(queue &q) {
            joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c);
            joint_matrix_copy(sg, sub_d, sub_c);
          }
+#ifdef OFFSET
          joint_matrix_store(
+              sg, sub_d, accD.template get_multi_ptr<access::decorated::no>(),
+              m * M, n * N, stride_C, layout_C);
+#else
+          joint_matrix_store(
              sg, sub_d,
              accD.template get_multi_ptr<access::decorated::no>() +
                  load_stride_C,
              stride_C, layout_C);
+#endif
        });
      });
   q.wait();
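In the CUDA test, the removed load_stride_A/B/C expressions were the hand-written, layout-dependent linearisation of exactly the (row, column) pairs the OFFSET branch now passes. A host-only check that the two agree for both layouts; the matrix sizes and tile indices below are arbitrary, not the test's:

```cpp
#include <cassert>
#include <cstddef>
#include <initializer_list>

constexpr std::size_t origin(std::size_t Row, std::size_t Col,
                             std::size_t Stride, bool RowMajor) {
  return RowMajor ? Row * Stride + Col : Col * Stride + Row;
}

int main() {
  constexpr std::size_t M = 16, N = 16, K = 16;
  constexpr std::size_t Big_M = 64, Big_N = 128, Big_K = 256;
  const std::size_t m = 2, n = 3, k = 5;

  for (bool row_major : {true, false}) {
    // Accumulator C: stride is Big_N (row-major) or Big_M (column-major).
    std::size_t stride_C = row_major ? Big_N : Big_M;
    std::size_t load_stride_C =
        row_major ? (m * M) * Big_N + n * N : (m * M) + n * N * Big_M;
    assert(origin(m * M, n * N, stride_C, row_major) == load_stride_C);

    // A: stride Big_K / Big_M; old offset (k*K)+(m*M*Big_K) or (k*K*Big_M)+(m*M).
    std::size_t stride_A = row_major ? Big_K : Big_M;
    std::size_t load_stride_A =
        row_major ? (k * K) + (m * M * Big_K) : (k * K * Big_M) + (m * M);
    assert(origin(m * M, k * K, stride_A, row_major) == load_stride_A);

    // B: stride Big_N / Big_K; old offset (k*K*Big_N)+(n*N) or (k*K)+(n*N*Big_K).
    std::size_t stride_B = row_major ? Big_N : Big_K;
    std::size_t load_stride_B =
        row_major ? (k * K * Big_N) + (n * N) : (k * K) + (n * N * Big_K);
    assert(origin(k * K, n * N, stride_B, row_major) == load_stride_B);
  }
  return 0;
}
```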
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
index b383c07f018a6..8e9afe414874c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_70.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
index 565bec3c69972..c0d26a9600409 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_72.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
index c872665ba0532..94680aa2523a0 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_80 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_80 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_80.