diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp
index a17ff03563ae..9dd71c95d7a5 100644
--- a/dpnp/backend/extensions/blas/blas_py.cpp
+++ b/dpnp/backend/extensions/blas/blas_py.cpp
@@ -134,6 +134,13 @@ PYBIND11_MODULE(_blas_impl, m)
               py::arg("device"));
     }
 
+    {
+        m.def("_is_16_bytes_aligned", &blas_ns::_is_16_bytes_aligned,
+              "Return ``True`` if the pointer of a USM allocation is "
+              "16-byte aligned in memory",
+              py::arg("a"));
+    }
+
     {
         m.def("_gemm_batch", &blas_ns::gemm_batch,
              "Call `gemm_batch` from OneMKL BLAS library to compute "
diff --git a/dpnp/backend/extensions/blas/gemm.cpp b/dpnp/backend/extensions/blas/gemm.cpp
index 42487eb9286a..b2f8fb4af757 100644
--- a/dpnp/backend/extensions/blas/gemm.cpp
+++ b/dpnp/backend/extensions/blas/gemm.cpp
@@ -26,6 +26,7 @@
 #include <pybind11/pybind11.h>
 
 // dpctl tensor headers
+#include "kernels/alignment.hpp"
 #include "utils/memory_overlap.hpp"
 #include "utils/output_validation.hpp"
 #include "utils/type_utils.hpp"
@@ -339,6 +340,12 @@ bool _is_lnl_bm_architecture(const sycl::device &dev)
     return false;
 }
 
+bool _is_16_bytes_aligned(const dpctl::tensor::usm_ndarray &a)
+{
+    return dpctl::tensor::kernels::alignment_utils::is_aligned<16>(
+        a.get_data());
+}
+
 template <typename Tab, typename Tc>
 struct GemmContigFactory
 {
diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp
index fed42755753e..a65e57b8da39 100644
--- a/dpnp/backend/extensions/blas/gemm.hpp
+++ b/dpnp/backend/extensions/blas/gemm.hpp
@@ -40,6 +40,7 @@ extern std::tuple<sycl::event, sycl::event, bool>
              const std::vector<sycl::event> &depends);
 
 extern bool _is_lnl_bm_architecture(const sycl::device &dev);
+extern bool _is_16_bytes_aligned(const dpctl::tensor::usm_ndarray &a);
 
 extern std::tuple<sycl::event, sycl::event, bool>
     gemm_batch(sycl::queue &exec_q,
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index f65ad3737b41..02a1adc6ab07 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -897,27 +897,19 @@ def dpnp_matmul(
             # MKLD-17976: due to known issue in OneMKL on Lunar Lake and
             # Battlemage G21 Intel GPU architectures, it forces
             # to implement a temporary workaround with extra copying of
-            # an input array in case when it has a small size and
-            # non-zero offset
-            # The issue was detected by failing tests for eig/eigh
+            # an input array in case when it does not have 16-byte
+            # alignment in memory
             # TODO: remove the workaround once OneMKL issue is resolved
             if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()):
-
-                def _need_to_copy(a):
-                    a_usm = dpnp.get_usm_ndarray(a)
-                    if a_usm._element_offset > 0 and a_usm.size < 16:
-                        return True
-                    return False
-
                 x1 = _copy_array(
                     x1,
-                    copy_flag=_need_to_copy(x1),
+                    copy_flag=not bi._is_16_bytes_aligned(x1),
                     dtype=compute_dtype,
                     order=res_order,
                 )
                 x2 = _copy_array(
                     x2,
-                    copy_flag=_need_to_copy(x2),
+                    copy_flag=not bi._is_16_bytes_aligned(x2),
                     dtype=compute_dtype,
                     order=res_order,
                 )
@@ -929,6 +921,26 @@ def _need_to_copy(a):
                 result,
             )
         else:  # call_flag == "gemm_batch"
+            # MKLD-17976: due to known issue in OneMKL on Lunar Lake and
+            # Battlemage G21 Intel GPU architectures, it forces
+            # to implement a temporary workaround with extra copying of
+            # an input array in case when it does not have 16-byte
+            # alignment in memory
+            # TODO: remove the workaround once OneMKL issue is resolved
+            if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()):
+                x1 = _copy_array(
+                    x1,
+                    copy_flag=not bi._is_16_bytes_aligned(x1),
+                    dtype=compute_dtype,
+                    order=res_order,
+                )
+                x2 = _copy_array(
+                    x2,
+                    copy_flag=not bi._is_16_bytes_aligned(x2),
+                    dtype=compute_dtype,
+                    order=res_order,
+                )
+
             result = _gemm_batch_matmul(
                 exec_q,
                 x1,
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index dbaab556e0c2..d68e6e7c4c87 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -3824,6 +3824,32 @@ def test_matmul_alias(self):
         result2 = dpnp.linalg.matmul(a, b)
         assert_array_equal(result1, result2)
 
+    @pytest.mark.parametrize(
+        "sh1, sh2",
+        [
+            ((2, 3, 3), (3, 3)),
+            ((3, 4, 4, 4), (4, 4, 4)),
+        ],
+        ids=["gemm", "gemm_batch"],
+    )
+    def test_matmul_with_offsets(self, sh1, sh2):
+        size1, size2 = numpy.prod(sh1, dtype=int), numpy.prod(sh2, dtype=int)
+        a = numpy.random.randint(-5, 5, size1).reshape(sh1)
+        b = numpy.random.randint(-5, 5, size2).reshape(sh2)
+        ia, ib = dpnp.array(a), dpnp.array(b)
+
+        result = ia[1] @ ib
+        expected = a[1] @ b
+        assert_array_equal(result, expected)
+
+        result = ib @ ia[1]
+        expected = b @ a[1]
+        assert_array_equal(result, expected)
+
+        result = ia[1] @ ia[1]
+        expected = a[1] @ a[1]
+        assert_array_equal(result, expected)
+
 
 class TestMatmulInvalidCases:
     @pytest.mark.parametrize(
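
Note for reviewers: the new C++ helper reduces to testing whether the USM data pointer is divisible by 16 (via dpctl's `kernels/alignment.hpp` `is_aligned<16>`), replacing the earlier offset/size heuristic (`_element_offset > 0 and size < 16`). The sketch below is not part of the patch; it mimics the predicate with plain NumPy (the `is_16_bytes_aligned` helper here is hypothetical) to show why the sliced views in `test_matmul_with_offsets` can lose alignment even when the base allocation has it.

```python
import numpy


def is_16_bytes_aligned(arr):
    # Same predicate as the C++ helper, applied to the view's data pointer.
    return arr.ctypes.data % 16 == 0


a = numpy.zeros((2, 3, 3), dtype=numpy.int64)
print(is_16_bytes_aligned(a))     # base allocation: typically True
# a[1] starts 3 * 3 * 8 = 72 bytes past the base pointer, and 72 % 16 == 8,
# so the view is misaligned and the workaround would copy it on LNL/BMG.
print(is_16_bytes_aligned(a[1]))  # False whenever the base pointer is aligned
```

This corresponds to the first parametrized case in the new test: with 8-byte elements, `ia[1]` of a `(2, 3, 3)` array sits at a 72-byte offset, which is not a multiple of 16.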