Add the new offset API to the CUDA test and fix sg -> Sg typos in matrix-unified.hpp

dkhaldi · dkhaldi · commit 2b7c61ec0f2a · 2024-09-26T14:06:38.000-07:00
diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
@@ -486,7 +486,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
     sycl::ext::oneapi::experimental::matrix::layout Layout) {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
-  std::ignore = sg;
+  std::ignore = Sg;
   throw exception(make_error_code(errc::runtime),
                   "Use joint_matrix_load on multi_ptr on Nvidia device.");
 #elif defined(__HIP_PLATFORM_AMD_MFMA__)
@@ -526,7 +526,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
     size_t RowIndex, size_t ColIndex, size_t Stride) {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
-  std::ignore = sg;
+  std::ignore = Sg;
   throw exception(make_error_code(errc::runtime),
                   "Use joint_matrix_load on multi_ptr on Nvidia device.");
 #elif defined(__HIP_PLATFORM_AMD_MFMA__)
@@ -672,7 +672,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
                                                  : RowIndex + ColIndex * Stride;
   sycl::ext::oneapi::detail::joint_matrix_store_hip<Group, T, NumRows, NumCols,
                                                     Space>(
-      Src.matrix_impl, BaseDst + StoreStride, Stride, Layout, sg);
+      Src.matrix_impl, BaseDst + StoreStride, Stride, Layout, Sg);
 #else
   std::ignore = Sg;
   using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
@@ -707,7 +707,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
     sycl::ext::oneapi::experimental::matrix::layout Layout) {
 #if defined(__SYCL_DEVICE_ONLY__)
 #if defined(__NVPTX__)
-  std::ignore = sg;
+  std::ignore = Sg;
   throw exception(make_error_code(errc::runtime),
                   "Use joint_matrix_store on multi_ptr on Nvidia device.");
 #elif defined(__HIP_PLATFORM_AMD_MFMA__)
diff --git a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
@@ -167,28 +167,43 @@ void test(queue &q) {
                 sub_c;
             joint_matrix<sycl::sub_group, Td, use::accumulator, M, N> sub_d;
             auto stride_C = layout_C == layout::row_major ? Big_N : Big_M;
+#ifdef OFFSET
+
+            joint_matrix_load(
+                sg, sub_c, accC.template get_multi_ptr<access::decorated::no>(),
+                m * M, n * N, stride_C, layout_C);
+#else
             auto load_stride_C = layout_C == layout::row_major
                                      ? (m * M) * Big_N + n * N
                                      : (m * M) + n * N * Big_M;
-
             joint_matrix_load(
                 sg, sub_c,
                 accC.template get_multi_ptr<access::decorated::no>() +
                     load_stride_C,
                 stride_C, layout_C);
-
+#endif
             auto stride_A = layout_A == layout::row_major ? Big_K : Big_M;
             auto stride_B = layout_B == layout::row_major ? Big_N : Big_K;
 
             // k = row/col id of current submatrix of BIG A/B matrices
             for (int k = 0; k < Sub_Tiles_K; k++) {
+#ifdef OFFSET
+              joint_matrix_load(
+                  sg, sub_a,
+                  accA.template get_multi_ptr<access::decorated::no>(), m * M,
+                  k * K, stride_A);
+
+              joint_matrix_load(
+                  sg, sub_b,
+                  accB.template get_multi_ptr<access::decorated::no>(), k * K,
+                  n * N, stride_B);
+#else
               auto load_stride_A = layout_A == layout::row_major
                                        ? (k * K) + (m * M * Big_K)
                                        : (k * K * Big_M) + (m * M);
               auto load_stride_B = layout_B == layout::row_major
                                        ? (k * K * Big_N) + (n * N)
                                        : (k * K) + (n * N * Big_K);
-
               joint_matrix_load(
                   sg, sub_a,
                   accA.template get_multi_ptr<access::decorated::no>() +
@@ -200,7 +215,7 @@ void test(queue &q) {
                   accB.template get_multi_ptr<access::decorated::no>() +
                       load_stride_B,
                   stride_B);
-
+#endif
               // round values to correct precision if using tf32
               if constexpr (std::is_same<T3, precision::tf32>::value) {
                 auto round_lambda = [](auto &x) { x = round_to_tf32(x); };
@@ -211,11 +226,17 @@ void test(queue &q) {
               joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c);
               joint_matrix_copy(sg, sub_d, sub_c);
             }
+#ifdef OFFSET
             joint_matrix_store(
+                sg, sub_d, accD.template get_multi_ptr<access::decorated::no>(),
+                m * M, n * N, stride_C, layout_C);
+#else
+            joint_matrix_store(
                 sg, sub_d,
                 accD.template get_multi_ptr<access::decorated::no>() +
                     load_stride_C,
                 stride_C, layout_C);
+#endif
           });
     });
     q.wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_70.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_72.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp
@@ -10,6 +10,9 @@
 // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_80 -o %t.out
 // RUN: %{run} %t.out
 //
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_80 -o %toff.out -DOFFSET
+// RUN: %{run} %toff.out
+//
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_80.