Skip to content

Commit 2b7c61e

Browse files
committed
Add the new offset API to CUDA test
1 parent 3565a3e commit 2b7c61e

File tree

5 files changed

+38
-8
lines changed

5 files changed

+38
-8
lines changed

sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
486486
sycl::ext::oneapi::experimental::matrix::layout Layout) {
487487
#if defined(__SYCL_DEVICE_ONLY__)
488488
#if defined(__NVPTX__)
489-
std::ignore = sg;
489+
std::ignore = Sg;
490490
throw exception(make_error_code(errc::runtime),
491491
"Use joint_matrix_load on multi_ptr on Nvidia device.");
492492
#elif defined(__HIP_PLATFORM_AMD_MFMA__)
@@ -526,7 +526,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
526526
size_t RowIndex, size_t ColIndex, size_t Stride) {
527527
#if defined(__SYCL_DEVICE_ONLY__)
528528
#if defined(__NVPTX__)
529-
std::ignore = sg;
529+
std::ignore = Sg;
530530
throw exception(make_error_code(errc::runtime),
531531
"Use joint_matrix_load on multi_ptr on Nvidia device.");
532532
#elif defined(__HIP_PLATFORM_AMD_MFMA__)
@@ -672,7 +672,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
672672
: RowIndex + ColIndex * Stride;
673673
sycl::ext::oneapi::detail::joint_matrix_store_hip<Group, T, NumRows, NumCols,
674674
Space>(
675-
Src.matrix_impl, BaseDst + StoreStride, Stride, Layout, sg);
675+
Src.matrix_impl, BaseDst + StoreStride, Stride, Layout, Sg);
676676
#else
677677
std::ignore = Sg;
678678
using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
@@ -707,7 +707,7 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
707707
sycl::ext::oneapi::experimental::matrix::layout Layout) {
708708
#if defined(__SYCL_DEVICE_ONLY__)
709709
#if defined(__NVPTX__)
710-
std::ignore = sg;
710+
std::ignore = Sg;
711711
throw exception(make_error_code(errc::runtime),
712712
"Use joint_matrix_store on multi_ptr on Nvidia device.");
713713
#elif defined(__HIP_PLATFORM_AMD_MFMA__)

sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -167,28 +167,43 @@ void test(queue &q) {
167167
sub_c;
168168
joint_matrix<sycl::sub_group, Td, use::accumulator, M, N> sub_d;
169169
auto stride_C = layout_C == layout::row_major ? Big_N : Big_M;
170+
#ifdef OFFSET
171+
172+
joint_matrix_load(
173+
sg, sub_c, accC.template get_multi_ptr<access::decorated::no>(),
174+
m * M, n * N, stride_C, layout_C);
175+
#else
170176
auto load_stride_C = layout_C == layout::row_major
171177
? (m * M) * Big_N + n * N
172178
: (m * M) + n * N * Big_M;
173-
174179
joint_matrix_load(
175180
sg, sub_c,
176181
accC.template get_multi_ptr<access::decorated::no>() +
177182
load_stride_C,
178183
stride_C, layout_C);
179-
184+
#endif
180185
auto stride_A = layout_A == layout::row_major ? Big_K : Big_M;
181186
auto stride_B = layout_B == layout::row_major ? Big_N : Big_K;
182187

183188
// k = row/col id of current submatrix of BIG A/B matrices
184189
for (int k = 0; k < Sub_Tiles_K; k++) {
190+
#ifdef OFFSET
191+
joint_matrix_load(
192+
sg, sub_a,
193+
accA.template get_multi_ptr<access::decorated::no>(), m * M,
194+
k * K, stride_A);
195+
196+
joint_matrix_load(
197+
sg, sub_b,
198+
accB.template get_multi_ptr<access::decorated::no>(), k * K,
199+
n * N, load_stride_B, stride_B);
200+
#else
185201
auto load_stride_A = layout_A == layout::row_major
186202
? (k * K) + (m * M * Big_K)
187203
: (k * K * Big_M) + (m * M);
188204
auto load_stride_B = layout_B == layout::row_major
189205
? (k * K * Big_N) + (n * N)
190206
: (k * K) + (n * N * Big_K);
191-
192207
joint_matrix_load(
193208
sg, sub_a,
194209
accA.template get_multi_ptr<access::decorated::no>() +
@@ -200,7 +215,7 @@ void test(queue &q) {
200215
accB.template get_multi_ptr<access::decorated::no>() +
201216
load_stride_B,
202217
stride_B);
203-
218+
#endif
204219
// round values to correct precision if using tf32
205220
if constexpr (std::is_same<T3, precision::tf32>::value) {
206221
auto round_lambda = [](auto &x) { x = round_to_tf32(x); };
@@ -211,11 +226,17 @@ void test(queue &q) {
211226
joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c);
212227
joint_matrix_copy(sg, sub_d, sub_c);
213228
}
229+
#ifdef OFFSET
214230
joint_matrix_store(
231+
sg, sub_d, accD.template get_multi_ptr<access::decorated::no>(),
232+
m * M, n * N, stride_C, layout_C);
233+
#else
234+
joint_matrix_store(
215235
sg, sub_d,
216236
accD.template get_multi_ptr<access::decorated::no>() +
217237
load_stride_C,
218238
stride_C, layout_C);
239+
#endif
219240
});
220241
});
221242
q.wait();

sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %t.out
1111
// RUN: %{run} %t.out
1212
//
13+
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %toff.out -DOFFSET
14+
// RUN: %{run} %toff.out
15+
//
1316
// This tests the unified matrix extension interfaces for the cuda backend.
1417
// This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
1518
// where sm_xx >= sm_70.

sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %t.out
1111
// RUN: %{run} %t.out
1212
//
13+
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %toff.out -DOFFSET
14+
// RUN: %{run} %toff.out
15+
//
1316
// This tests the unified matrix extension interfaces for the cuda backend.
1417
// This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
1518
// where sm_xx >= sm_72.

sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_80 -o %t.out
1111
// RUN: %{run} %t.out
1212
//
13+
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %toff.out -DOFFSET
14+
// RUN: %{run} %toff.out
15+
//
1316
// This tests the unified matrix extension interfaces for the cuda backend.
1417
// This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
1518
// where sm_xx >= sm_80.

0 commit comments

Comments
 (0)